From 7d51323240b8bf7dd82f2523759397aac08af97d Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Wed, 12 Nov 2025 10:58:00 +0800 Subject: [PATCH 01/14] hexagon: add buffer support checks for hexagon sessions --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index cabd301ad35..4b9aafa9e1d 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1912,6 +1912,21 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t return true; } +static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess) { + return true; +} + +template +static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, + ggml_backend_buffer_t buffer, + _TBuffers... buffers) { + if (buffer && (!ggml_backend_buffer_is_hexagon(buffer) || ggml_backend_hexagon_buffer_get_sess(buffer) != sess)) { + return false; + } + + return hex_supported_buffer(sess, buffers...); +} + static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -1959,16 +1974,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s } // src0 & src1 & dst must be mapped to the same session - if (src0->buffer && - (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { - return false; - } - if (src1->buffer && - (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { - return false; - } - if (dst->buffer && - (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) { return false; } From fc68ce48aa806b587bc8d8e622bc6028ea1302f0 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Wed, 12 Nov 2025 11:03:37 +0800 Subject: [PATCH 02/14] refactor: simplify buffer support checks in hexagon operations --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 85 +++----------------------- 1 file changed, 7 insertions(+), 78 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 4b9aafa9e1d..99087ab890e 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2022,20 +2022,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session // src0 (weights) must be repacked and mapped to the same session // src1 & sr2 & dst must be mapped to the same session - if (src0->buffer && - (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { - return false; - } - if (src1->buffer && - (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { - return false; - } - if (src2->buffer && - (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) { - return false; - } - if (dst->buffer && - (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) { return false; } @@ -2069,16 +2056,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se } // src0, src1 & dst must be mapped to the same session - if (src0->buffer && - (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { - return false; - } - if (src1->buffer && - (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { - return false; - } - if (dst->buffer && - (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) { return false; } @@ -2110,20 +2088,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se } // src0, src1 & dst must be mapped to the same session - if (src0->buffer && - (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { - return false; - } - if (src1->buffer && - (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { - return false; - } - if (src2->buffer && - (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) { - return false; - } - if (dst->buffer && - (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) { return false; } @@ -2150,12 +2115,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses } // src0 & dst must be mapped to the same session - if (src0->buffer && - (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { - return false; - } - if (dst->buffer && - (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + if (!hex_supported_buffer(sess, src0->buffer, dst->buffer)) { return false; } @@ -2192,16 +2152,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if (src0->buffer && - (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { - return false; - } - if (src1 && src1->buffer && - (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { - return false; - } - if (dst->buffer && - (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) { return false; } @@ -2254,16 +2205,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s } // src0, src1 & dst must be mapped to the same session - if (src0->buffer && - (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { - return false; - } - if (src1 && src1->buffer && - (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { - return false; - } - if (dst->buffer && - (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) { return false; } @@ -2318,20 +2260,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess } // src0, src1, src2 & dst must be mapped to the same session - if (src0->buffer && - (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) { - return false; - } - if (src1->buffer && - (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) { - return false; - } - if (src2 && src2->buffer && - (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) { - return false; - } - if (dst->buffer && - (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) { + if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) { return false; } From d1bfaddb34d848ef3f45d43491a217e9866a3888 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Wed, 12 Nov 2025 22:10:38 +0800 Subject: [PATCH 03/14] hexagon: update buffer support checks to use tensor structure --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 27 +++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 99087ab890e..f9d23bc92f3 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1916,15 +1916,16 @@ static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess return true; } -template +template static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, - ggml_backend_buffer_t buffer, - _TBuffers... buffers) { - if (buffer && (!ggml_backend_buffer_is_hexagon(buffer) || ggml_backend_hexagon_buffer_get_sess(buffer) != sess)) { + const ggml_tensor * t, + _TTensor... tensors) { + if (t && t->buffer && + (!ggml_backend_buffer_is_hexagon(t->buffer) || ggml_backend_hexagon_buffer_get_sess(t->buffer) != sess)) { return false; } - return hex_supported_buffer(sess, buffers...); + return hex_supported_buffer(sess, tensors...); } static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { @@ -1974,7 +1975,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s } // src0 & src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) { + if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } @@ -2022,7 +2023,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session // src0 (weights) must be repacked and mapped to the same session // src1 & sr2 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) { + if (!hex_supported_buffer(sess, src0, src1, src2, dst)) { return false; } @@ -2056,7 +2057,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se } // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) { + if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } @@ -2088,7 +2089,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se } // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) { + if (!hex_supported_buffer(sess, src0, src1, src2, dst)) { return false; } @@ -2115,7 +2116,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses } // src0 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0->buffer, dst->buffer)) { + if (!hex_supported_buffer(sess, src0, dst)) { return false; } @@ -2152,7 +2153,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session } // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) { + if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } @@ -2205,7 +2206,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s } // src0, src1 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) { + if (!hex_supported_buffer(sess, src0, src1, dst)) { return false; } @@ -2260,7 +2261,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess } // src0, src1, src2 & dst must be mapped to the same session - if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) { + if (!hex_supported_buffer(sess, src0, src1, src2, dst)) { return false; } From 02bb8fcd1cd21c870c95b3e34d56836df0119d07 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 20 Nov 2025 12:04:11 +0800 Subject: [PATCH 04/14] refactor: streamline buffer initialization for DSP queue in hexagon operations --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 103 +++++++++++-------------- 1 file changed, 47 insertions(+), 56 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index f9d23bc92f3..57f15e4ff13 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2282,6 +2282,45 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { h->nb[3] = t->nb[3]; } +template +static ggml_hexagon_session * dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt], + const bool is_src0_static, + const ggml_tensor * dst, + std::initializer_list srcs) { + GGML_ASSERT(_Cnt == srcs.size() + 1); + GGML_ASSERT(srcs.size() > 0); + + constexpr const auto buffer_init = [](dspqueue_buffer * buffer, const ggml_tensor * t, uint32_t flags) { + if (!t || !t->buffer) { + return; + } + + auto tensor_buf = static_cast(t->buffer->context); + buffer->fd = tensor_buf->fd; + buffer->ptr = t->data; + buffer->offset = (uint8_t *) t->data - tensor_buf->base; + buffer->size = ggml_nbytes(t); + buffer->flags = flags; + }; + + memset(bufs, 0, sizeof(bufs)); + buffer_init(&bufs[_Cnt - 1], dst, + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); // Output buffer: flush CPU caches + + { + uint32_t src_flags = is_src0_static ? 0 : + (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + for (size_t i = 0; i < srcs.size(); i++) { + buffer_init(&bufs[i], srcs.begin()[i], src_flags); + src_flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + } + } + + return static_cast(dst->buffer->context)->sess; +} + static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) { auto buf = static_cast(t->buffer->context); auto sess = buf->sess; @@ -2296,10 +2335,6 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) const struct ggml_tensor * src1 = op->src[1]; const struct ggml_tensor * dst = op; - auto src0_buf = static_cast(src0->buffer->context); - auto src1_buf = static_cast(src1->buffer->context); - auto dst_buf = static_cast(dst->buffer->context); - uint64_t t1, t2; t1 = ggml_time_us(); @@ -2320,40 +2355,20 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - dspqueue_buffer bufs[3]; - memset(bufs, 0, sizeof(bufs)); - // First buffer Weights. // The content is static, there is no need to do any cache management - bufs[0].fd = src0_buf->fd; - bufs[0].ptr = src0->data; - bufs[0].offset = (uint8_t *) src0->data - src0_buf->base; - bufs[0].size = ggml_nbytes(src0); - bufs[0].flags = 0; - + // // Second buffer Input Activations. This is a buffer that the CPU // writes and the DSP reads, so we'll need to flush CPU caches and // invalidate DSP ones. On platforms with I/O coherency support the // framework will automatically skip cache operations where possible. - bufs[1].fd = src1_buf->fd; - bufs[1].ptr = src1->data; - bufs[1].offset = (uint8_t *) src1->data - src1_buf->base; - bufs[1].size = ggml_nbytes(src1); - bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - + // // Third buffer Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - bufs[2].fd = dst_buf->fd; - bufs[2].ptr = dst->data; - bufs[2].offset = (uint8_t *) dst->data - dst_buf->base; - bufs[2].size = ggml_nbytes(dst); - bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); - - // Primary DSP session from the src0 (normally weight) tensor - auto sess = src0_buf->sess; + dspqueue_buffer bufs[3]; + auto sess = dspqueue_buffers_init(bufs, true, dst, { src0, src1 }); if (opt_verbose) { char dims[64 * GGML_MAX_SRC]; @@ -2517,9 +2532,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { const struct ggml_tensor * src1 = node->src[1]; const struct ggml_tensor * dst = node; - auto src0_buf = static_cast(src0->buffer->context); - auto src1_buf = static_cast(src1->buffer->context); - auto dst_buf = static_cast(dst->buffer->context); uint64_t t1 = 0; uint64_t t2 = 0; @@ -2556,45 +2568,24 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { init_htp_tensor(&req.src1, src1); init_htp_tensor(&req.dst, dst); - dspqueue_buffer bufs[3]; - memset(bufs, 0, sizeof(bufs)); - // First buffer = First Operand of Binary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - bufs[0].fd = src0_buf->fd; - bufs[0].ptr = src0->data; - bufs[0].offset = (uint8_t *) src0->data - src0_buf->base; - bufs[0].size = ggml_nbytes(src0); - bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP; - + // // Second buffer = Second Operand of Binary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - bufs[1].fd = src1_buf->fd; - bufs[1].ptr = src1->data; - bufs[1].offset = (uint8_t *) src1->data - src1_buf->base; - bufs[1].size = ggml_nbytes(src1); - bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - + // // Third buffer = Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - bufs[2].fd = dst_buf->fd; - bufs[2].ptr = dst->data; - bufs[2].offset = (uint8_t *) dst->data - dst_buf->base; - bufs[2].size = ggml_nbytes(dst); - bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); - - // Primary DSP session from the src0 tensor - ggml_hexagon_session * sess = src0_buf->sess; + dspqueue_buffer bufs[3]; + auto sess = dspqueue_buffers_init(bufs, false, dst, { src0, src1 }); if (opt_verbose) { char dims[64 * GGML_MAX_SRC]; From 0d9bda3c2c56c3e3b3881e0ed81f1ea23e0240c9 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 20 Nov 2025 21:33:31 +0800 Subject: [PATCH 05/14] refactor: simplify buffer initialization in DSP queue for hexagon operations --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 242 +++++++------------------ 1 file changed, 64 insertions(+), 178 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 57f15e4ff13..b4f1ff42cf7 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2283,18 +2283,14 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { } template -static ggml_hexagon_session * dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt], - const bool is_src0_static, - const ggml_tensor * dst, - std::initializer_list srcs) { +static size_t dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt], + const bool is_src0_static, + const ggml_tensor * dst, + std::initializer_list srcs) { GGML_ASSERT(_Cnt == srcs.size() + 1); GGML_ASSERT(srcs.size() > 0); constexpr const auto buffer_init = [](dspqueue_buffer * buffer, const ggml_tensor * t, uint32_t flags) { - if (!t || !t->buffer) { - return; - } - auto tensor_buf = static_cast(t->buffer->context); buffer->fd = tensor_buf->fd; buffer->ptr = t->data; @@ -2304,21 +2300,34 @@ static ggml_hexagon_session * dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt }; memset(bufs, 0, sizeof(bufs)); - buffer_init(&bufs[_Cnt - 1], dst, - DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); // Output buffer: flush CPU caches - { - uint32_t src_flags = is_src0_static ? 0 : - (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - for (size_t i = 0; i < srcs.size(); i++) { - buffer_init(&bufs[i], srcs.begin()[i], src_flags); - src_flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + const uint32_t src0_flags = is_src0_static ? 0 : + (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP + buffer_init(&bufs[0], srcs.begin()[0], src0_flags); + + size_t n_bufs = 1; + for (size_t i = 1; i < srcs.size(); i++) { + auto * src = srcs.begin()[i]; + if (!src) { + continue; } + + buffer_init(&bufs[n_bufs], srcs.begin()[i], + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | + DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Flush CPU and Invalidate DSP + n_bufs++; } - return static_cast(dst->buffer->context)->sess; + buffer_init(&bufs[n_bufs], dst, + DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); // Output buffer: flush CPU caches + n_bufs++; + + return n_bufs; +} + +static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) { + return static_cast(t->buffer->context)->sess; } static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) { @@ -2368,7 +2377,9 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. dspqueue_buffer bufs[3]; - auto sess = dspqueue_buffers_init(bufs, true, dst, { src0, src1 }); + dspqueue_buffers_init(bufs, true, dst, { src0, src1 }); + + auto * sess = get_session_from_tensor(src0); if (opt_verbose) { char dims[64 * GGML_MAX_SRC]; @@ -2414,11 +2425,6 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag const struct ggml_tensor * src2 = op->src[2]; const struct ggml_tensor * dst = op; - auto src0_buf = static_cast(src0->buffer->context); - auto src1_buf = static_cast(src1->buffer->context); - auto src2_buf = static_cast(src2->buffer->context); - auto dst_buf = static_cast(dst->buffer->context); - uint64_t t1, t2; t1 = ggml_time_us(); @@ -2440,51 +2446,27 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - dspqueue_buffer bufs[4]; - memset(bufs, 0, sizeof(bufs)); - // First buffer Weights. // The content is static, there is no need to do any cache management - bufs[0].fd = src0_buf->fd; - bufs[0].ptr = src0->data; - bufs[0].offset = (uint8_t *) src0->data - src0_buf->base; - bufs[0].size = ggml_nbytes(src0); - bufs[0].flags = 0; - + // // Second buffer Input Activations. This is a buffer that the CPU // writes and the DSP reads, so we'll need to flush CPU caches and // invalidate DSP ones. On platforms with I/O coherency support the // framework will automatically skip cache operations where possible. - bufs[1].fd = src1_buf->fd; - bufs[1].ptr = src1->data; - bufs[1].offset = (uint8_t *) src1->data - src1_buf->base; - bufs[1].size = ggml_nbytes(src1); - bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - + // // Third buffer expert IDs. This is a buffer that the CPU // writes and the DSP reads, so we'll need to flush CPU caches and // invalidate DSP ones. On platforms with I/O coherency support the // framework will automatically skip cache operations where possible. - bufs[2].fd = src2_buf->fd; - bufs[2].ptr = src2->data; - bufs[2].offset = (uint8_t *) src2->data - src2_buf->base; - bufs[2].size = ggml_nbytes(src2); - bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - + // // Forth buffer Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - bufs[3].fd = dst_buf->fd; - bufs[3].ptr = dst->data; - bufs[3].offset = (uint8_t *) dst->data - dst_buf->base; - bufs[3].size = ggml_nbytes(dst); - bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); + dspqueue_buffer bufs[4]; + dspqueue_buffers_init(bufs, true, dst, { src0, src1, src2 }); - // Primary DSP session from the src0 (normally weight) tensor - auto sess = src0_buf->sess; + auto * sess = get_session_from_tensor(src0); if (opt_verbose) { char dims[64 * GGML_MAX_SRC]; @@ -2532,7 +2514,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { const struct ggml_tensor * src1 = node->src[1]; const struct ggml_tensor * dst = node; - uint64_t t1 = 0; uint64_t t2 = 0; @@ -2585,7 +2566,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. dspqueue_buffer bufs[3]; - auto sess = dspqueue_buffers_init(bufs, false, dst, { src0, src1 }); + dspqueue_buffers_init(bufs, false, dst, { src0, src1 }); + + auto * sess = get_session_from_tensor(src0); if (opt_verbose) { char dims[64 * GGML_MAX_SRC]; @@ -2632,11 +2615,6 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { const struct ggml_tensor * src2 = node->src[2]; const struct ggml_tensor * dst = node; - auto src0_buf = static_cast(src0->buffer->context); - auto src1_buf = static_cast(src1->buffer->context); - auto src2_buf = static_cast(src2->buffer->context); - auto dst_buf = static_cast(dst->buffer->context); - uint64_t t1 = 0; uint64_t t2 = 0; @@ -2667,42 +2645,14 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { init_htp_tensor(&req.src2, src2); init_htp_tensor(&req.dst, dst); - dspqueue_buffer bufs[4]; - memset(bufs, 0, sizeof(bufs)); - // First buffer = input activations - bufs[0].fd = src0_buf->fd; - bufs[0].ptr = src0->data; - bufs[0].offset = (uint8_t *) src0->data - src0_buf->base; - bufs[0].size = ggml_nbytes(src0); - bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP; - // Second buffer = experts bias - bufs[1].fd = src1_buf->fd; - bufs[1].ptr = src1->data; - bufs[1].offset = (uint8_t *) src1->data - src1_buf->base; - bufs[1].size = ggml_nbytes(src1); - bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - // Third buffer = activated experts - bufs[2].fd = src2_buf->fd; - bufs[2].ptr = src2->data; - bufs[2].offset = (uint8_t *) src2->data - src2_buf->base; - bufs[2].size = ggml_nbytes(src2); - bufs[2].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - // Forth buffer = output activations - bufs[3].fd = dst_buf->fd; - bufs[3].ptr = dst->data; - bufs[3].offset = (uint8_t *) dst->data - dst_buf->base; - bufs[3].size = ggml_nbytes(dst); - bufs[3].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); + dspqueue_buffer bufs[4]; + dspqueue_buffers_init(bufs, false, dst, { src0, src1, src2 }); - // Primary DSP session from the src0 tensor - ggml_hexagon_session * sess = src0_buf->sess; + auto * sess = get_session_from_tensor(src0); if (opt_verbose) { char dims[64 * GGML_MAX_SRC]; @@ -2812,56 +2762,28 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - dspqueue_buffer bufs[3]; - int n_bufs = 0; - - memset(bufs, 0, sizeof(bufs)); - // First buffer = Only Operand of Unary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - auto src0_buf = static_cast(src0->buffer->context); - bufs[n_bufs].fd = src0_buf->fd; - bufs[n_bufs].ptr = src0->data; - bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base; - bufs[n_bufs].size = ggml_nbytes(src0); - bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP; - ++n_bufs; - - if (src1) { - // Second buffer = Second Operand of Binary op - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - auto src1_buf = static_cast(src1->buffer->context); - bufs[n_bufs].fd = src1_buf->fd; - bufs[n_bufs].ptr = src1->data; - bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base; - bufs[n_bufs].size = ggml_nbytes(src1); - bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - ++n_bufs; - } - + // + // Second buffer(nullable) = Second Operand of Binary op + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. + // // Second or third buffer = Output Activations. We'll handle DSP // Second buffer = Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - auto dst_buf = static_cast(dst->buffer->context); - bufs[n_bufs].fd = dst_buf->fd; - bufs[n_bufs].ptr = dst->data; - bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base; - bufs[n_bufs].size = ggml_nbytes(dst); - bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); - ++n_bufs; + dspqueue_buffer bufs[3]; + size_t n_bufs = dspqueue_buffers_init(bufs, false, dst, { src0, src1 }); // Primary DSP session from the src0 tensor - ggml_hexagon_session * sess = src0_buf->sess; + auto * sess = get_session_from_tensor(src0); if (opt_verbose) { char dims[64 * GGML_MAX_SRC]; @@ -2949,70 +2871,34 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - dspqueue_buffer bufs[4]; - int n_bufs = 0; - - memset(bufs, 0, sizeof(bufs)); - // First buffer // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - auto src0_buf = static_cast(src0->buffer->context); - bufs[n_bufs].fd = src0_buf->fd; - bufs[n_bufs].ptr = src0->data; - bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base; - bufs[n_bufs].size = ggml_nbytes(src0); - bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP; - ++n_bufs; - + // // Second buffer // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - auto src1_buf = static_cast(src1->buffer->context); - bufs[n_bufs].fd = src1_buf->fd; - bufs[n_bufs].ptr = src1->data; - bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base; - bufs[n_bufs].size = ggml_nbytes(src1); - bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - ++n_bufs; - - if (src2) { - // Third buffer - // This is a buffer that the CPU writes and the DSP reads, so we'll - // need to flush CPU caches and invalidate DSP ones. On platforms - // with I/O coherency support the framework will automatically skip - // cache operations where possible. - auto src2_buf = static_cast(src2->buffer->context); - bufs[n_bufs].fd = src2_buf->fd; - bufs[n_bufs].ptr = src2->data; - bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base; - bufs[n_bufs].size = ggml_nbytes(src2); - bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - ++n_bufs; - } - + // + // Third buffer(nullable) + // This is a buffer that the CPU writes and the DSP reads, so we'll + // need to flush CPU caches and invalidate DSP ones. On platforms + // with I/O coherency support the framework will automatically skip + // cache operations where possible. + // // Final buffer = Output Activations. We'll handle DSP // Second buffer = Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - auto dst_buf = static_cast(dst->buffer->context); - bufs[n_bufs].fd = dst_buf->fd; - bufs[n_bufs].ptr = dst->data; - bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base; - bufs[n_bufs].size = ggml_nbytes(dst); - bufs[n_bufs].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); - ++n_bufs; + dspqueue_buffer bufs[4]; + size_t n_bufs = dspqueue_buffers_init(bufs, false, dst, { src0, src1, src2 }); // Primary DSP session from the src0 tensor - ggml_hexagon_session * sess = src0_buf->sess; + auto * sess = get_session_from_tensor(src0); if (opt_verbose) { char dims[64 * GGML_MAX_SRC]; From 0dfae43c18ef1d0de2b907aa00e1847ad6ba64e1 Mon Sep 17 00:00:00 2001 From: chraac Date: Thu, 20 Nov 2025 22:08:55 +0800 Subject: [PATCH 06/14] refactor: optimize hex_supported_buffer function by fold expression --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index b4f1ff42cf7..dd667f86653 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1912,20 +1912,13 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t return true; } -static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess) { - return true; -} - template -static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, - const ggml_tensor * t, - _TTensor... tensors) { - if (t && t->buffer && - (!ggml_backend_buffer_is_hexagon(t->buffer) || ggml_backend_hexagon_buffer_get_sess(t->buffer) != sess)) { - return false; - } - - return hex_supported_buffer(sess, tensors...); +static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) { + return ([&]() -> bool { + return !tensors || !tensors->buffer || + (ggml_backend_buffer_is_hexagon(tensors->buffer) && + ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess); + }() && ...); } static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { From b886fafe0107b0450e5de88189c6ad1816e61c41 Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 21 Nov 2025 00:41:59 +0800 Subject: [PATCH 07/14] wip --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 108 ++++++------------------- 1 file changed, 23 insertions(+), 85 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index dd667f86653..1f1e621a4cd 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -240,6 +240,23 @@ struct ggml_hexagon_session { uint32_t prof_pkts; }; +static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) { + char dims[64 * GGML_MAX_SRC]; + char strides[64 * GGML_MAX_SRC]; + char types[16 * GGML_MAX_SRC]; + char buffs[64 * GGML_MAX_SRC]; + char names[64 * GGML_MAX_SRC]; + + hex_format_op_dims(dims, op); + hex_format_op_strides(strides, op); + hex_format_op_types(types, op); + hex_format_op_buffs(buffs, op); + hex_format_op_names(names, op); + + HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), + names, dims, types, strides, buffs, req_flags); +} + void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) { // Bump pending flag (cleared in the session::flush once we get the responce) this->op_pending++; // atomic inc @@ -2375,20 +2392,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) auto * sess = get_session_from_tensor(src0); if (opt_verbose) { - char dims[64 * GGML_MAX_SRC]; - char strides[64 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - hex_format_op_dims(dims, op); - hex_format_op_strides(strides, op); - hex_format_op_types(types, op); - hex_format_op_buffs(buffs, op); - hex_format_op_names(names, op); - - HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), - names, dims, types, strides, buffs, req.flags); + hex_print_op_info(op, sess, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src1, &bufs[1]); @@ -2462,20 +2466,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag auto * sess = get_session_from_tensor(src0); if (opt_verbose) { - char dims[64 * GGML_MAX_SRC]; - char strides[64 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - hex_format_op_dims(dims, op); - hex_format_op_types(types, op); - hex_format_op_buffs(buffs, op); - hex_format_op_names(names, op); - - HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), - names, dims, types, strides, buffs, req.flags); - + hex_print_op_info(op, sess, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src1, &bufs[1]); @@ -2564,20 +2555,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { auto * sess = get_session_from_tensor(src0); if (opt_verbose) { - char dims[64 * GGML_MAX_SRC]; - char strides[16 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - hex_format_op_dims(dims, op); - hex_format_op_strides(strides, op); - hex_format_op_types(types, op); - hex_format_op_buffs(buffs, op); - hex_format_op_names(names, op); - - HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), - ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags); + hex_print_op_info(op, sess, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src1, &bufs[1]); @@ -2648,21 +2626,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { auto * sess = get_session_from_tensor(src0); if (opt_verbose) { - char dims[64 * GGML_MAX_SRC]; - char strides[16 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - hex_format_op_dims(dims, op); - hex_format_op_strides(strides, op); - hex_format_op_types(types, op); - hex_format_op_buffs(buffs, op); - hex_format_op_names(names, op); - - HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), - ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags); - + hex_print_op_info(op, sess, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); hex_dump_dspbuf(src1, &bufs[1]); @@ -2779,20 +2743,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { auto * sess = get_session_from_tensor(src0); if (opt_verbose) { - char dims[64 * GGML_MAX_SRC]; - char strides[64 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - hex_format_op_dims(dims, op); - hex_format_op_strides(strides, op); - hex_format_op_types(types, op); - hex_format_op_buffs(buffs, op); - hex_format_op_names(names, op); - - HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), - names, dims, types, strides, buffs, req.flags); + hex_print_op_info(op, sess, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); if (src1) { @@ -2894,20 +2845,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { auto * sess = get_session_from_tensor(src0); if (opt_verbose) { - char dims[64 * GGML_MAX_SRC]; - char strides[64 * GGML_MAX_SRC]; - char types[16 * GGML_MAX_SRC]; - char buffs[64 * GGML_MAX_SRC]; - char names[64 * GGML_MAX_SRC]; - - hex_format_op_dims(dims, op); - hex_format_op_strides(strides, op); - hex_format_op_types(types, op); - hex_format_op_buffs(buffs, op); - hex_format_op_names(names, op); - - HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op), - names, dims, types, strides, buffs, req.flags); + hex_print_op_info(op, sess, req.flags); if (opt_verbose > 1) { hex_dump_dspbuf(src0, &bufs[0]); if (src1) { From 6d330237631a0924f25c3b57d807b9849ea3cd6a Mon Sep 17 00:00:00 2001 From: chraac Date: Fri, 21 Nov 2025 23:40:38 +0800 Subject: [PATCH 08/14] refactor: simplify dspqueue_buffers_init function and its usage in hexagon operations --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 118 +++++++++++-------------- 1 file changed, 54 insertions(+), 64 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 1f1e621a4cd..0b4e2c3d4df 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2292,48 +2292,20 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) { h->nb[3] = t->nb[3]; } -template -static size_t dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt], - const bool is_src0_static, - const ggml_tensor * dst, - std::initializer_list srcs) { - GGML_ASSERT(_Cnt == srcs.size() + 1); - GGML_ASSERT(srcs.size() > 0); - - constexpr const auto buffer_init = [](dspqueue_buffer * buffer, const ggml_tensor * t, uint32_t flags) { - auto tensor_buf = static_cast(t->buffer->context); - buffer->fd = tensor_buf->fd; - buffer->ptr = t->data; - buffer->offset = (uint8_t *) t->data - tensor_buf->base; - buffer->size = ggml_nbytes(t); - buffer->flags = flags; - }; - - memset(bufs, 0, sizeof(bufs)); - - const uint32_t src0_flags = is_src0_static ? 0 : - (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush CPU - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP - buffer_init(&bufs[0], srcs.begin()[0], src0_flags); - - size_t n_bufs = 1; - for (size_t i = 1; i < srcs.size(); i++) { - auto * src = srcs.begin()[i]; - if (!src) { - continue; - } - - buffer_init(&bufs[n_bufs], srcs.begin()[i], - DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | - DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Flush CPU and Invalidate DSP - n_bufs++; +static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) { + if (!t) { + return 0; } - buffer_init(&bufs[n_bufs], dst, - DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER); // Output buffer: flush CPU caches - n_bufs++; - - return n_bufs; + memset(buf, 0, sizeof(*buf)); + auto tensor_buf = static_cast(t->buffer->context); + buf->fd = tensor_buf->fd; + buf->ptr = t->data; + buf->offset = (uint8_t *) t->data - tensor_buf->base; + buf->size = ggml_nbytes(t); + buf->flags = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0); // Flush CPU + buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0); // Invalidate DSP + return 1; } static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) { @@ -2374,20 +2346,23 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags) req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } + dspqueue_buffer bufs[3]; + // First buffer Weights. // The content is static, there is no need to do any cache management - // + dspqueue_buffers_init(bufs, src0, false, false); + // Second buffer Input Activations. This is a buffer that the CPU // writes and the DSP reads, so we'll need to flush CPU caches and // invalidate DSP ones. On platforms with I/O coherency support the // framework will automatically skip cache operations where possible. - // + dspqueue_buffers_init(&bufs[1], src1, true, true); + // Third buffer Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - dspqueue_buffer bufs[3]; - dspqueue_buffers_init(bufs, true, dst, { src0, src1 }); + dspqueue_buffers_init(&bufs[2], dst, true, false); auto * sess = get_session_from_tensor(src0); @@ -2443,25 +2418,28 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } + dspqueue_buffer bufs[4]; // First buffer Weights. // The content is static, there is no need to do any cache management - // + dspqueue_buffers_init(bufs, src0, false, false); + // Second buffer Input Activations. This is a buffer that the CPU // writes and the DSP reads, so we'll need to flush CPU caches and // invalidate DSP ones. On platforms with I/O coherency support the // framework will automatically skip cache operations where possible. - // + dspqueue_buffers_init(&bufs[1], src1, true, true); + // Third buffer expert IDs. This is a buffer that the CPU // writes and the DSP reads, so we'll need to flush CPU caches and // invalidate DSP ones. On platforms with I/O coherency support the // framework will automatically skip cache operations where possible. - // + dspqueue_buffers_init(&bufs[2], src2, true, true); + // Forth buffer Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - dspqueue_buffer bufs[4]; - dspqueue_buffers_init(bufs, true, dst, { src0, src1, src2 }); + dspqueue_buffers_init(&bufs[3], dst, true, false); auto * sess = get_session_from_tensor(src0); @@ -2533,24 +2511,26 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) { init_htp_tensor(&req.src1, src1); init_htp_tensor(&req.dst, dst); + dspqueue_buffer bufs[3]; // First buffer = First Operand of Binary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - // + dspqueue_buffers_init(bufs, src0, true, true); + // Second buffer = Second Operand of Binary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - // + dspqueue_buffers_init(&bufs[1], src1, true, true); + // Third buffer = Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - dspqueue_buffer bufs[3]; - dspqueue_buffers_init(bufs, false, dst, { src0, src1 }); + dspqueue_buffers_init(&bufs[2], dst, true, false); auto * sess = get_session_from_tensor(src0); @@ -2616,12 +2596,15 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) { init_htp_tensor(&req.src2, src2); init_htp_tensor(&req.dst, dst); + dspqueue_buffer bufs[4]; // First buffer = input activations + dspqueue_buffers_init(bufs, src0, true, true); // Second buffer = experts bias + dspqueue_buffers_init(&bufs[1], src1, true, true); // Third buffer = activated experts + dspqueue_buffers_init(&bufs[2], src2, true, true); // Forth buffer = output activations - dspqueue_buffer bufs[4]; - dspqueue_buffers_init(bufs, false, dst, { src0, src1, src2 }); + dspqueue_buffers_init(&bufs[3], dst, true, true); auto * sess = get_session_from_tensor(src0); @@ -2719,25 +2702,28 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) { req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } + dspqueue_buffer bufs[3]; + // First buffer = Only Operand of Unary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - // + size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true); + // Second buffer(nullable) = Second Operand of Binary op // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - // + n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true); + // Second or third buffer = Output Activations. We'll handle DSP // Second buffer = Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - dspqueue_buffer bufs[3]; - size_t n_bufs = dspqueue_buffers_init(bufs, false, dst, { src0, src1 }); + n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false); // Primary DSP session from the src0 tensor auto * sess = get_session_from_tensor(src0); @@ -2815,31 +2801,35 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) { req.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } + dspqueue_buffer bufs[4]; + // First buffer // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - // + size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true); + // Second buffer // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - // + n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true); + // Third buffer(nullable) // This is a buffer that the CPU writes and the DSP reads, so we'll // need to flush CPU caches and invalidate DSP ones. On platforms // with I/O coherency support the framework will automatically skip // cache operations where possible. - // + n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true); + // Final buffer = Output Activations. We'll handle DSP // Second buffer = Output Activations. We'll handle DSP // cache maintenance in the response message but need to flush // CPU caches to ensure any previously written dirty lines are // written out before writes from the DSP start. - dspqueue_buffer bufs[4]; - size_t n_bufs = dspqueue_buffers_init(bufs, false, dst, { src0, src1, src2 }); + n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false); // Primary DSP session from the src0 tensor auto * sess = get_session_from_tensor(src0); From 2df16815717dd0483643c2dbd111a11af1cd5105 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 22 Nov 2025 12:20:30 +0800 Subject: [PATCH 09/14] fix: improve nan handling at hvx_vec_fast_sigmoid_fp32_guard --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 5f94645cde3..cdefa7da389 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -773,7 +773,7 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) { HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1)); HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4)); - HVX_Vector res = hvx_vec_inverse_fp32(v5); + HVX_Vector res = hvx_vec_inverse_fp32_guard(v5); res = Q6_Vqf32_vmpy_VsfVsf(v3, res); return Q6_Vsf_equals_Vqf32(res); @@ -959,13 +959,19 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { } static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { - static const float kMaxExp = -88.02f; // log(INF) + static const float kMinExp = -88.02f; // 0 + static const float kMaxExp = 88.02f; // 1 - const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp)); - const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp); + const HVX_Vector one = hvx_vec_splat_fp32(1.f); + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); + + const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); + const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp); HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); - return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero()); + out = Q6_V_vmux_QVV(pred_max, out, one); + return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero()); } static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { From b9cf80a2761374f7b8910393a228c23c112c3411 Mon Sep 17 00:00:00 2001 From: chraac Date: Sat, 22 Nov 2025 23:27:03 +0800 Subject: [PATCH 10/14] refactor: optimize hvx_vec_inverse_fp32_guard for better nan handling --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index cdefa7da389..d6fc9bbfc87 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -727,21 +727,15 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { } static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) { - static const float kInf = INFINITY; - static const uint32_t kNanMask = 0x7fffffff; - static const uint32_t kNanMin = 0x7f800000; - - const HVX_Vector inf = hvx_vec_splat_fp32(kInf); - const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf); + static const uint32_t kNanInfMask = 0x7f800000; HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask); - const HVX_Vector nan_min = Q6_V_vsplat_R(kNanMin); - HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask); - const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out); + const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask); + HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask); + const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out); - return Q6_V_vmux_QVV(pred, out, Q6_V_vzero()); + return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out); } #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 @@ -962,15 +956,15 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { static const float kMinExp = -88.02f; // 0 static const float kMaxExp = 88.02f; // 1 - const HVX_Vector one = hvx_vec_splat_fp32(1.f); - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); + const HVX_Vector one = hvx_vec_splat_fp32(1.f); + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp); HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v); - out = Q6_V_vmux_QVV(pred_max, out, one); + out = Q6_V_vmux_QVV(pred_max, out, one); return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero()); } From 784fa2d95b4a7f5a9d3aa334b3c660afa9ed6b04 Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 23 Nov 2025 00:12:43 +0800 Subject: [PATCH 11/14] refactor: update hvx_vec_fast_sigmoid_fp32_guard to use adjusted exponent limits --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index d6fc9bbfc87..3a832604aa9 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -767,7 +767,7 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) { HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1)); HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4)); - HVX_Vector res = hvx_vec_inverse_fp32_guard(v5); + HVX_Vector res = hvx_vec_inverse_fp32(v5); res = Q6_Vqf32_vmpy_VsfVsf(v3, res); return Q6_Vsf_equals_Vqf32(res); @@ -953,8 +953,8 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { } static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { - static const float kMinExp = -88.02f; // 0 - static const float kMaxExp = 88.02f; // 1 + static const float kMinExp = -87.f; // 0 + static const float kMaxExp = 87.f; // 1 const HVX_Vector one = hvx_vec_splat_fp32(1.f); const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); From 2d8c6e26b1983b828c994ac79c772fed06c4289b Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 23 Nov 2025 00:19:42 +0800 Subject: [PATCH 12/14] refactor: modify hvx_vec_fast_sigmoid_fp32_guard to accept parameters for improved flexibility --- ggml/src/ggml-hexagon/htp/hvx-utils.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 3a832604aa9..125f08e6fdc 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -952,14 +952,10 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) { return Q6_Vsf_equals_Vqf32(temp); } -static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) { - static const float kMinExp = -87.f; // 0 - static const float kMaxExp = 87.f; // 1 - - const HVX_Vector one = hvx_vec_splat_fp32(1.f); - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); - +static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v, + HVX_Vector one, + HVX_Vector max_exp, + HVX_Vector min_exp) { const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v); const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp); @@ -977,9 +973,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * const HVX_Vector * restrict v_src = (HVX_Vector *) src; HVX_Vector * restrict v_dst = (HVX_Vector *) dst; + static const float kMinExp = -87.f; // 0 + static const float kMaxExp = 87.f; // 1 + + const HVX_Vector one = hvx_vec_splat_fp32(1.f); + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp); + #pragma unroll(4) for (int i = 0; i < step_of_1; i++) { - v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]); + v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); } } From 9686ef877f33551a693539801d2ee893a0d217ef Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 23 Nov 2025 00:30:19 +0800 Subject: [PATCH 13/14] refactor: update hvx_vec_exp_fp32_guard to accept max_exp and inf parameters to save some instructions --- ggml/src/ggml-hexagon/htp/hvx-exp.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c index d0735e9325e..21bf46a542f 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-exp.c +++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c @@ -16,13 +16,8 @@ #include "hvx-utils.h" #include "ops-utils.h" -static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) { - static const float kInf = INFINITY; - static const float kMaxExp = 88.02f; // log(INF) - - const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); - const HVX_Vector inf = hvx_vec_splat_fp32(kInf); - const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); +static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) { + const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp); HVX_Vector out = hvx_vec_exp_fp32(in_vec); @@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int HVX_Vector vec_out = Q6_V_vzero(); + static const float kInf = INFINITY; + static const float kMaxExp = 88.02f; // log(INF) + + const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp); + const HVX_Vector inf = hvx_vec_splat_fp32(kInf); + if (0 == unaligned_loop) { HVX_Vector * p_vec_in1 = (HVX_Vector *) src; HVX_Vector * p_vec_out = (HVX_Vector *) dst; @@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++); - *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in); + *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); } else { - *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++); + *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf); } } } else { @@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); } else { - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf); } } } @@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int if (true == negate) { HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in); - vec_out = hvx_vec_exp_fp32_guard(neg_vec_in); + vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf); } else { - vec_out = hvx_vec_exp_fp32_guard(in); + vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf); } hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out); From 515a00ca6311f6a3f2431326271eb796fb6ebbdf Mon Sep 17 00:00:00 2001 From: chraac Date: Sun, 23 Nov 2025 00:40:17 +0800 Subject: [PATCH 14/14] refactor: move hvx_vec_inverse_fp32_guard implementation to hvx-inverse.c for better perf --- ggml/src/ggml-hexagon/htp/hvx-inverse.c | 18 +++++++++++++++--- ggml/src/ggml-hexagon/htp/hvx-utils.h | 12 ------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c index 953d3e6c167..4d70634fcd4 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c +++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c @@ -16,6 +16,15 @@ #include "hvx-utils.h" #include "ops-utils.h" +static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) { + HVX_Vector out = hvx_vec_inverse_fp32(v_sf); + + HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask); + const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out); + + return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out); +} + void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) { int left_over = num_elems & (VLEN_FP32 - 1); int num_elems_whole = num_elems - left_over; @@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n"); } + static const uint32_t kNanInfMask = 0x7f800000; + const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask); + if (0 == unaligned_loop) { HVX_Vector * p_vec_in = (HVX_Vector *) src; HVX_Vector * p_vec_out = (HVX_Vector *) dst; #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { - *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++); + *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask); } } else { #pragma unroll(4) for (int i = 0; i < num_elems_whole; i += VLEN_FP32) { HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32); - *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in); + *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask); } } @@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const float * dstf = (float *) dst + num_elems_whole; HVX_Vector in = *(HVX_UVector *) srcf; - HVX_Vector out = hvx_vec_inverse_fp32_guard(in); + HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask); hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out); } diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 125f08e6fdc..28b0014fb5a 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -726,18 +726,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) { return Q6_Vsf_equals_Vqf32(r_qf); } -static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) { - static const uint32_t kNanInfMask = 0x7f800000; - - HVX_Vector out = hvx_vec_inverse_fp32(v_sf); - - const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask); - HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask); - const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out); - - return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out); -} - #define FAST_SIGMOID_LOG2F (0x3fb8aa3b) // 1.442695022 #define FAST_SIGMOID_C1 (0x3d009076) // 0.03138777 #define FAST_SIGMOID_C2 (0x3e8d74bd) // 0.276281267