diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index cabd301ad3572..0b4e2c3d4df16 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -240,6 +240,23 @@ struct ggml_hexagon_session {
     uint32_t prof_pkts;
 };
 
+static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
+    char dims[64 * GGML_MAX_SRC];
+    char strides[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+    char buffs[64 * GGML_MAX_SRC];
+    char names[64 * GGML_MAX_SRC];
+
+    hex_format_op_dims(dims, op);
+    hex_format_op_strides(strides, op);
+    hex_format_op_types(types, op);
+    hex_format_op_buffs(buffs, op);
+    hex_format_op_names(names, op);
+
+    HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
+                names, dims, types, strides, buffs, req_flags);
+}
+
 void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
     // Bump pending flag (cleared in the session::flush once we get the response)
     this->op_pending++;  // atomic inc
@@ -1912,6 +1929,15 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
     return true;
 }
 
+template <typename... _TTensor>
+static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
+    return ([&]() -> bool {
+        return !tensors || !tensors->buffer ||
+               (ggml_backend_buffer_is_hexagon(tensors->buffer) &&
+                ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
+    }() && ...);
+}
+
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
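For reference, the helper above relies on a C++17 fold expression: the lambda is instantiated once per tensor in the parameter pack and the results are folded over `&&`, so the first failing tensor short-circuits the rest, and a null tensor (an absent optional src) passes vacuously. A minimal standalone sketch of the same pattern, with stand-in types (`session`, `buffer`, `tensor` and `all_on_session` here are illustrative, not the backend's types):

```cpp
// sketch: fold-expression validation over a parameter pack (C++17)
#include <cstdio>

struct session;
struct buffer { session * sess; };
struct tensor { buffer * buf; };
struct session {};

template <typename... Ts>
static bool all_on_session(const session * sess, Ts... tensors) {
    // one lambda instantiation per pack element, folded over &&
    return ([&]() -> bool {
        return !tensors || !tensors->buf || tensors->buf->sess == sess;
    }() && ...);
}

int main() {
    session s;
    buffer  b{&s};
    tensor  t0{&b}, t1{&b};
    // a null tensor is accepted, mirroring optional srcs
    std::printf("%d\n", all_on_session(&s, &t0, &t1, (tensor *) nullptr));  // prints 1
}
```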
@@ -1959,16 +1985,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
     }
 
     // src0 & src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2016,20 +2033,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
 
     // src0 (weights) must be repacked and mapped to the same session
     // src1 & src2 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
         return false;
     }
 
@@ -2063,16 +2067,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2104,20 +2099,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
     }
 
     // src0, src1, src2 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
         return false;
     }
@@ -2144,12 +2126,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
     }
 
     // src0 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, dst)) {
         return false;
     }
 
@@ -2186,16 +2163,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1 && src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2248,16 +2216,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1 && src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
    }
 
@@ -2312,20 +2271,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
     }
 
     // src0, src1, src2 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2 && src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
        return false;
     }
@@ -2346,6 +2292,26 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->nb[3] = t->nb[3];
 }
 
+static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
+    if (!t) {
+        return 0;
+    }
+
+    memset(buf, 0, sizeof(*buf));
+    auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
+    buf->fd     = tensor_buf->fd;
+    buf->ptr    = t->data;
+    buf->offset = (uint8_t *) t->data - tensor_buf->base;
+    buf->size   = ggml_nbytes(t);
+    buf->flags  = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0);         // Flush CPU
+    buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0);  // Invalidate DSP
+    return 1;
+}
+
+static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
+    return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
+}
+
 static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
     auto buf  = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
     auto sess = buf->sess;
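Returning the number of descriptors actually written (0 for a null tensor, 1 otherwise) is what lets the call sites below chain optional operands without an `if`: the count is accumulated into `n_bufs` and the array stays dense. A standalone sketch of that property, where `buf_desc` and `init_one` are illustrative stand-ins for `dspqueue_buffer` and `dspqueue_buffers_init`:

```cpp
// standalone sketch: dense packing of optional buffers via a 0/1 return count
#include <cstddef>
#include <cstdio>

struct buf_desc { const void * ptr; };

// stand-in for dspqueue_buffers_init: writes one descriptor, or none for null
static std::size_t init_one(buf_desc * b, const void * tensor) {
    if (!tensor) return 0;
    b->ptr = tensor;
    return 1;
}

int main() {
    int src0, dst;                    // pretend tensors; src1 is absent
    const void * src1 = nullptr;

    buf_desc bufs[3];
    std::size_t n = init_one(bufs, &src0);
    n += init_one(&bufs[n], src1);    // contributes nothing, leaves no hole
    n += init_one(&bufs[n], &dst);
    std::printf("n_bufs = %zu\n", n); // 2, and bufs[0..1] are contiguous
}
```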
@@ -2360,10 +2326,6 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     const struct ggml_tensor * src1 = op->src[1];
     const struct ggml_tensor * dst  = op;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1, t2;
 
     t1 = ggml_time_us();
@@ -2385,55 +2347,27 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     }
 
     dspqueue_buffer bufs[3];
-    memset(bufs, 0, sizeof(bufs));
 
     // First buffer Weights.
     // The content is static, there is no need to do any cache management
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = 0;
+    dspqueue_buffers_init(bufs, src0, false, false);
 
     // Second buffer Input Activations. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
 
     // Third buffer Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[2].fd     = dst_buf->fd;
-    bufs[2].ptr    = dst->data;
-    bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[2].size   = ggml_nbytes(dst);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffers_init(&bufs[2], dst, true, false);
 
-    // Primary DSP session from the src0 (normally weight) tensor
-    auto sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
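The `(flush_host, flush_htp)` pairs passed above encode the three buffer roles that the comments describe: static weights need no cache management, CPU-written inputs need a CPU flush plus a DSP invalidate, and DSP-written outputs need only the CPU flush (the DSP side is handled in the response message). A compilable sketch of that mapping; the `buf_role` enum and `cache_args_for` helper are illustrative, not part of the backend API:

```cpp
// sketch: mapping buffer roles onto the (flush_host, flush_htp) arguments
enum class buf_role { static_weights, cpu_written_input, dsp_written_output };

struct cache_args { bool flush_host; bool flush_htp; };

static cache_args cache_args_for(buf_role role) {
    switch (role) {
        case buf_role::static_weights:    return {false, false};  // content never changes
        case buf_role::cpu_written_input: return {true,  true};   // flush CPU, invalidate DSP
        case buf_role::dsp_written_output:
        default:                          return {true,  false};  // flush dirty CPU lines only;
                                                                  // DSP side handled in response
    }
}
```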
@@ -2463,11 +2397,6 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     const struct ggml_tensor * src2 = op->src[2];
     const struct ggml_tensor * dst  = op;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1, t2;
 
     t1 = ggml_time_us();
@@ -2490,66 +2419,32 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     }
 
     dspqueue_buffer bufs[4];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer Weights.
     // The content is static, there is no need to do any cache management
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = 0;
+    dspqueue_buffers_init(bufs, src0, false, false);
 
     // Second buffer Input Activations. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
 
     // Third buffer expert IDs. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[2].fd     = src2_buf->fd;
-    bufs[2].ptr    = src2->data;
-    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
-    bufs[2].size   = ggml_nbytes(src2);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    dspqueue_buffers_init(&bufs[2], src2, true, true);
 
     // Fourth buffer Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[3].fd     = dst_buf->fd;
-    bufs[3].ptr    = dst->data;
-    bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[3].size   = ggml_nbytes(dst);
-    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffers_init(&bufs[3], dst, true, false);
 
-    // Primary DSP session from the src0 (normally weight) tensor
-    auto sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
-
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2581,10 +2476,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src1 = node->src[1];
     const struct ggml_tensor * dst  = node;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1 = 0;
     uint64_t t2 = 0;
 
@@ -2621,60 +2512,30 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[3];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer = First Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
+    dspqueue_buffers_init(bufs, src0, true, true);
 
     // Second buffer = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
 
     // Third buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[2].fd     = dst_buf->fd;
-    bufs[2].ptr    = dst->data;
-    bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[2].size   = ggml_nbytes(dst);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffers_init(&bufs[2], dst, true, false);
 
-    // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[16 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
-                    ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
        if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2705,11 +2566,6 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src2 = node->src[2];
     const struct ggml_tensor * dst  = node;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1 = 0;
     uint64_t t2 = 0;
 
@@ -2741,58 +2597,19 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     init_htp_tensor(&req.dst, dst);
 
     dspqueue_buffer bufs[4];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer = input activations
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-
+    dspqueue_buffers_init(bufs, src0, true, true);
     // Second buffer = experts bias
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
     // Third buffer = activated experts
-    bufs[2].fd     = src2_buf->fd;
-    bufs[2].ptr    = src2->data;
-    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
-    bufs[2].size   = ggml_nbytes(src2);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
+    dspqueue_buffers_init(&bufs[2], src2, true, true);
     // Fourth buffer = output activations
-    bufs[3].fd     = dst_buf->fd;
-    bufs[3].ptr    = dst->data;
-    bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[3].size   = ggml_nbytes(dst);
-    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffers_init(&bufs[3], dst, true, true);
 
-    // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[16 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
-                    ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
-
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2886,71 +2703,33 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
     }
 
     dspqueue_buffer bufs[3];
-    int n_bufs = 0;
-
-    memset(bufs, 0, sizeof(bufs));
 
     // First buffer = Only Operand of Unary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    bufs[n_bufs].fd     = src0_buf->fd;
-    bufs[n_bufs].ptr    = src0->data;
-    bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src0);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                           DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-    ++n_bufs;
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
 
-    if (src1) {
-        // Second buffer = Second Operand of Binary op
-        // This is a buffer that the CPU writes and the DSP reads, so we'll
-        // need to flush CPU caches and invalidate DSP ones. On platforms
-        // with I/O coherency support the framework will automatically skip
-        // cache operations where possible.
-        auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-        bufs[n_bufs].fd     = src1_buf->fd;
-        bufs[n_bufs].ptr    = src1->data;
-        bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
-        bufs[n_bufs].size   = ggml_nbytes(src1);
-        bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                               DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-        ++n_bufs;
-    }
+    // Second buffer (nullable) = Second Operand of Binary op
+    // This is a buffer that the CPU writes and the DSP reads, so we'll
+    // need to flush CPU caches and invalidate DSP ones. On platforms
+    // with I/O coherency support the framework will automatically skip
+    // cache operations where possible.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
 
     // Second or third buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-    bufs[n_bufs].fd     = dst_buf->fd;
-    bufs[n_bufs].ptr    = dst->data;
-    bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(dst);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
-    ++n_bufs;
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
 
     // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             if (src1) {
@@ -3023,85 +2802,40 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
     }
 
     dspqueue_buffer bufs[4];
-    int n_bufs = 0;
-
-    memset(bufs, 0, sizeof(bufs));
 
     // First buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    bufs[n_bufs].fd     = src0_buf->fd;
-    bufs[n_bufs].ptr    = src0->data;
-    bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src0);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                           DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-    ++n_bufs;
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
 
     // Second buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    bufs[n_bufs].fd     = src1_buf->fd;
-    bufs[n_bufs].ptr    = src1->data;
-    bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src1);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                           DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-    ++n_bufs;
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
 
-    if (src2) {
-        // Third buffer
-        // This is a buffer that the CPU writes and the DSP reads, so we'll
-        // need to flush CPU caches and invalidate DSP ones. On platforms
-        // with I/O coherency support the framework will automatically skip
-        // cache operations where possible.
-        auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-        bufs[n_bufs].fd     = src2_buf->fd;
-        bufs[n_bufs].ptr    = src2->data;
-        bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
-        bufs[n_bufs].size   = ggml_nbytes(src2);
-        bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                               DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-        ++n_bufs;
-    }
+    // Third buffer (nullable)
+    // This is a buffer that the CPU writes and the DSP reads, so we'll
+    // need to flush CPU caches and invalidate DSP ones. On platforms
+    // with I/O coherency support the framework will automatically skip
+    // cache operations where possible.
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
 
     // Final buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    auto dst_buf = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-    bufs[n_bufs].fd     = dst_buf->fd;
-    bufs[n_bufs].ptr    = dst->data;
-    bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(dst);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
-    ++n_bufs;
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
 
     // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             if (src1) {
diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c
index d0735e9325e1c..21bf46a542f9e 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-exp.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c
@@ -16,13 +16,8 @@
 #include "hvx-utils.h"
 #include "ops-utils.h"
 
-static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
-    static const float kInf    = INFINITY;
-    static const float kMaxExp = 88.02f;  // log(INF)
-
-    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
-    const HVX_Vector inf     = hvx_vec_splat_fp32(kInf);
-    const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
+static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
+    const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
 
     HVX_Vector out = hvx_vec_exp_fp32(in_vec);
 
@@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
 
     HVX_Vector vec_out = Q6_V_vzero();
 
+    static const float kInf    = INFINITY;
+    static const float kMaxExp = 88.02f;  // log(INF)
+
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector inf     = hvx_vec_splat_fp32(kInf);
+
     if (0 == unaligned_loop) {
         HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
@@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             if (true == negate) {
                 HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
-                *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in);
+                *p_vec_out++ = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
             } else {
-                *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
+                *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
             }
         }
     } else {
@@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
             if (true == negate) {
                 HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
             } else {
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
             }
         }
     }
@@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
 
         if (true == negate) {
             HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
-            vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
+            vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
         } else {
-            vec_out = hvx_vec_exp_fp32_guard(in);
+            vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
         }
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);
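The hvx-exp.c change is classic loop-invariant hoisting: the `max_exp` and `inf` splat vectors used to be rebuilt inside the guard on every vector iteration; now they are built once per `hvx_exp_f32` call and passed in as parameters. A scalar analogue of the same transformation, where `exp_guard` and `exp_f32` are illustrative names rather than backend functions:

```cpp
// minimal scalar analogue of hoisting the guard constants out of the loop
#include <cmath>
#include <cstddef>

static inline float exp_guard(float x, float max_exp, float inf) {
    // the guard constants arrive as parameters instead of being rebuilt here
    return (x > max_exp) ? inf : std::exp(x);
}

static void exp_f32(const float * src, float * dst, std::size_t n) {
    const float max_exp = 88.02f;    // same cutoff the patch uses
    const float inf     = INFINITY;  // built once per call, not once per element
    for (std::size_t i = 0; i < n; i++) {
        dst[i] = exp_guard(src[i], max_exp, inf);
    }
}
```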
diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
index 953d3e6c16709..4d70634fcd483 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
@@ -16,6 +16,15 @@
 #include "hvx-utils.h"
 #include "ops-utils.h"
 
+static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {
+    HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
+
+    HVX_Vector masked_out = Q6_V_vand_VV(out, nan_inf_mask);
+    const HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
+
+    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
+}
+
 void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
     int left_over       = num_elems & (VLEN_FP32 - 1);
     int num_elems_whole = num_elems - left_over;
@@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
     }
 
+    static const uint32_t kNanInfMask = 0x7f800000;
+    const HVX_Vector nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
+
     if (0 == unaligned_loop) {
         HVX_Vector * p_vec_in  = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
 
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
+            *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
         }
     } else {
 #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
+            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
         }
     }
 
@@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         float * dstf = (float *) dst + num_elems_whole;
 
         HVX_Vector in  = *(HVX_UVector *) srcf;
-        HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
+        HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
     }
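The new inverse guard works on the IEEE-754 bit pattern: the exponent field `0x7f800000` is all ones exactly for +/-Inf and for NaN, so `(bits & 0x7f800000) == 0x7f800000` flags both in one compare, and the vmux forces those lanes to zero. A scalar sketch of the same check (the `inverse_guarded` name is illustrative):

```cpp
// scalar equivalent of the new guard: Inf/NaN results are forced to zero
#include <cstdint>
#include <cstring>

static float inverse_guarded(float x) {
    float r = 1.0f / x;                 // stand-in for hvx_vec_inverse_fp32
    uint32_t bits;
    std::memcpy(&bits, &r, sizeof(bits));
    const uint32_t nan_inf_mask = 0x7f800000u;
    if ((bits & nan_inf_mask) == nan_inf_mask) {
        return 0.0f;                    // mirrors Q6_V_vmux_QVV(pred, zero, out)
    }
    return r;
}
```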
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 5f94645cde3b1..28b0014fb5a22 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -726,24 +726,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
     return Q6_Vsf_equals_Vqf32(r_qf);
 }
 
-static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
-    static const float    kInf     = INFINITY;
-    static const uint32_t kNanMask = 0x7fffffff;
-    static const uint32_t kNanMin  = 0x7f800000;
-
-    const HVX_Vector inf = hvx_vec_splat_fp32(kInf);
-    const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
-
-    HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
-
-    const HVX_Vector nan_mask = Q6_V_vsplat_R(kNanMask);
-    const HVX_Vector nan_min  = Q6_V_vsplat_R(kNanMin);
-    HVX_Vector masked_out = Q6_V_vand_VV(out, nan_mask);
-    const HVX_VectorPred pred = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
-
-    return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
-}
-
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022
 #define FAST_SIGMOID_C1    (0x3d009076)  // 0.03138777
 #define FAST_SIGMOID_C2    (0x3e8d74bd)  // 0.276281267
@@ -958,14 +940,16 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
     return Q6_Vsf_equals_Vqf32(temp);
 }
 
-static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
-    static const float kMaxExp = -88.02f;  // log(INF)
-
-    const HVX_Vector max_exp = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
-    const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
+static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
+                                                         HVX_Vector one,
+                                                         HVX_Vector max_exp,
+                                                         HVX_Vector min_exp) {
+    const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
+    const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
 
-    return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
+    out = Q6_V_vmux_QVV(pred_max, out, one);
+    return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
 }
 
 static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
@@ -977,9 +961,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     const HVX_Vector * restrict v_src = (HVX_Vector *) src;
     HVX_Vector * restrict       v_dst = (HVX_Vector *) dst;
 
+    static const float kMinExp = -87.f;  // 0
+    static const float kMaxExp = 87.f;   // 1
+
+    const HVX_Vector one     = hvx_vec_splat_fp32(1.f);
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
+
 #pragma unroll(4)
     for (int i = 0; i < step_of_1; i++) {
-        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
+        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
     }
 }
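The reworked sigmoid guard clamps on both sides instead of one: inputs above +87 pin the result to 1, inputs below -87 pin it to 0, and only the interior goes through the fast-sigmoid polynomial, which would otherwise over/underflow in its exp step. A scalar picture of the new behavior (the `sigmoid_guarded` name is illustrative, and `std::exp` stands in for `hvx_vec_fast_sigmoid_fp32`):

```cpp
// scalar sketch of the two-sided clamp implemented by the new guard
#include <cmath>

static float sigmoid_guarded(float x) {
    const float kMaxExp =  87.0f;  // sigmoid(x) ~= 1 above this
    const float kMinExp = -87.0f;  // sigmoid(x) ~= 0 below this
    if (x > kMaxExp) return 1.0f;  // mirrors vmux(pred_max, out, one)
    if (x < kMinExp) return 0.0f;  // mirrors vmux(pred_min, out, zero)
    return 1.0f / (1.0f + std::exp(-x));
}
```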