From dcf367edf6b4ac5dd934253cbf016df9cc061a63 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 21:32:12 +0800 Subject: [PATCH 01/34] fix device binding at ggml_backend_qnn_buffer_type --- ggml/src/ggml-qnn/ggml-qnn.cpp | 47 ++++++++++++++-------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index c57692b867bc7..e3038d27eda0b 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -124,10 +124,6 @@ class ggml_backend_qnn_buffer_context { size_t _buffer_size = 0; }; -struct ggml_backend_qnn_buffer_type_context { - std::string name; -}; - ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); } @@ -150,9 +146,9 @@ void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { } void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) { - // Do nothing here, the qnn tensor will be create along with the graph. GGML_UNUSED(buffer); GGML_UNUSED(tensor); + // TODO: we should create the qnn tensor along with the ggml tensor } void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, @@ -255,31 +251,26 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { } ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { - static ggml_backend_qnn_buffer_type_context ggml_backend_qnn_buffer_type_contexts[GGML_QNN_MAX_DEVICES]; static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; - static bool ggml_backend_qnn_buffer_type_initialized = false; auto *dev_ctx = get_device_context(dev); - if (!ggml_backend_qnn_buffer_type_initialized) { - for (size_t i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - auto &context = ggml_backend_qnn_buffer_type_contexts[i]; - context = {std::string(QNN_BACKEND_NAME) + std::to_string(i)}; - ggml_backend_qnn_buffer_types[i] = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ - ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ - ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ - ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host, - }, - /* .device */ dev, - /* .context = */ &context, - }; - } - ggml_backend_qnn_buffer_type_initialized = true; + if (!ggml_backend_qnn_buffer_types[dev_ctx->device].device) { + ggml_backend_qnn_buffer_types[dev_ctx->device] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ + ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ + ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ + ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host, + }, + /* .device */ dev, + /* .context = */ nullptr, + }; + } else { + GGML_ASSERT(ggml_backend_qnn_buffer_types[dev_ctx->device].device == dev); } return &ggml_backend_qnn_buffer_types[dev_ctx->device]; From f39fcf5747ae50309e3d3b73fd855efac9337494 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 22:27:23 +0800 Subject: [PATCH 02/34] merge ggml_backend_qnn_buffer_context and qnn_mem_buffer --- ggml/src/ggml-qnn/buffer.hpp | 67 +++++++++++++++++++++++++++------- ggml/src/ggml-qnn/ggml-qnn.cpp | 
58 +++++------------------------ ggml/src/ggml-qnn/tensor.hpp | 22 +++++------ ggml/src/ggml-qnn/utils.cpp | 8 ++++ ggml/src/ggml-qnn/utils.hpp | 1 + 5 files changed, 83 insertions(+), 73 deletions(-) diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 676e88c0454be..e741a201a3e92 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -6,11 +6,22 @@ #include "qnn-lib.hpp" namespace qnn { -class ggml_qnn_rpc_buffer { + +class qnn_buffer_interface { +public: + virtual ~qnn_buffer_interface() = default; + + virtual bool is_valid() const = 0; + virtual uint8_t *get_buffer() = 0; + virtual size_t get_size() const = 0; + virtual Qnn_MemHandle_t get_mem_handle() const = 0; +}; + +class qnn_rpc_buffer : public qnn_buffer_interface { public: - ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, - uint32_t *dimensions, Qnn_DataType_t data_type) : - _qnn_instance(qnn_instance), _size(size) { + qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, + uint32_t *dimensions, Qnn_DataType_t data_type) + : _qnn_instance(qnn_instance), _size(size) { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); @@ -22,7 +33,7 @@ class ggml_qnn_rpc_buffer { QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); } - ~ggml_qnn_rpc_buffer() { + ~qnn_rpc_buffer() { if (_qnn_instance) { if (_qnn_rpc_mem_handle) { _qnn_instance->unregister_rpcmem(_qnn_rpc_mem_handle); @@ -34,11 +45,11 @@ class ggml_qnn_rpc_buffer { } } - bool is_valid() const { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } + bool is_valid() const override { return _qnn_rpc_buffer && _qnn_rpc_mem_handle; } - uint8_t *get_buffer() const { return _qnn_rpc_buffer; } - size_t get_size() const { return _size; } - Qnn_MemHandle_t get_mem_handle() const { return _qnn_rpc_mem_handle; } + uint8_t *get_buffer() override { return _qnn_rpc_buffer; } + size_t get_size() const override { return _size; } + Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; } private: std::shared_ptr _qnn_instance; @@ -46,10 +57,40 @@ class ggml_qnn_rpc_buffer { uint8_t *_qnn_rpc_buffer = nullptr; Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; - ggml_qnn_rpc_buffer(const ggml_qnn_rpc_buffer &) = delete; - void operator=(const ggml_qnn_rpc_buffer &) = delete; - ggml_qnn_rpc_buffer(ggml_qnn_rpc_buffer &&) = delete; - void operator=(ggml_qnn_rpc_buffer &&) = delete; + DISABLE_COPY(qnn_rpc_buffer); + DISABLE_MOVE(qnn_rpc_buffer); +}; + +class qnn_mem_buffer : public qnn_buffer_interface { +public: + qnn_mem_buffer(size_t size) { + _buffer = reinterpret_cast(qnn::page_align_alloc(size)); + + if (!_buffer) { + QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); + return; + } + + _buffer_size = size; + } + + ~qnn_mem_buffer() { + // the free will do nothing if the _buffer is nullptr + qnn::align_free(_buffer); + } + + bool is_valid() const override { return _buffer != nullptr; } + + uint8_t *get_buffer() override { return _buffer; } + size_t get_size() const override { return _buffer_size; } + Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } + +private: + uint8_t *_buffer = nullptr; + size_t _buffer_size = 0; + + DISABLE_COPY(qnn_mem_buffer); + DISABLE_MOVE(qnn_mem_buffer); }; } // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp 
b/ggml/src/ggml-qnn/ggml-qnn.cpp index e3038d27eda0b..d9dbebf9a0141 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,7 +1,5 @@ #include "ggml-qnn.h" -#include - #include #include #include @@ -88,60 +86,26 @@ static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVIC static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, "The NPU device should be an accelerator device"); -class ggml_backend_qnn_buffer_context { -public: - ggml_backend_qnn_buffer_context(QNNBackend device, std::shared_ptr instance, size_t size) - : _instance(instance), _name(QNN_BACKEND_NAME + std::to_string(device)) { - // TODO: fix this for other platforms - size_t size_page = sysconf(_SC_PAGESIZE); - - // TODO: for qnn npu, a better way here is to reuse the buffer allocated by - // qnn rpc, will save an extra copy - _buffer = qnn::align_alloc(size_page, size); - - if (!_buffer) { - QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); - return; - } - - _buffer_size = size; - } - - ~ggml_backend_qnn_buffer_context() { - // the free will do nothing if the _buffer is nullptr - qnn::align_free(_buffer); - } - - bool is_valid() const { return _buffer != nullptr; } - - void *get_buffer() { return _buffer; } - size_t get_buffer_size() { return _buffer_size; } - -private: - std::shared_ptr _instance; - std::string _name; - void *_buffer = nullptr; - size_t _buffer_size = 0; -}; - ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); } +qnn::qnn_buffer_interface *get_buffer_context(ggml_backend_buffer_t buffer) { + return reinterpret_cast(buffer->context); +} + /* * ----------------------------------------------------------------------------------------------- * qnn backend buffer object * ----------------------------------------------------------------------------------------------- */ void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - + auto *ctx = get_buffer_context(buffer); delete ctx; } void *ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - + auto *ctx = get_buffer_context(buffer); return ctx->get_buffer(); } @@ -154,7 +118,6 @@ void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tens void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) { GGML_UNUSED(buffer); - memcpy((char *)tensor->data + offset, data, size); } @@ -176,9 +139,8 @@ bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const stru } void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_qnn_buffer_context *ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - - memset(ctx->get_buffer(), value, ctx->get_buffer_size()); + auto *ctx = get_buffer_context(buffer); + memset(ctx->get_buffer(), value, ctx->get_size()); } ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { @@ -204,9 +166,7 @@ const char *ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { } ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - auto *dev_ctx = get_device_context(buft->device); - ggml_backend_qnn_buffer_context *ctx = - new 
ggml_backend_qnn_buffer_context((QNNBackend)dev_ctx->device, dev_ctx->instance, size); + qnn::qnn_buffer_interface *ctx = new qnn::qnn_mem_buffer(size); if (!ctx->is_valid()) { return nullptr; } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 0a9a367015127..c9caadf2102e6 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -51,7 +51,7 @@ class ggml_qnn_tensor { ~ggml_qnn_tensor() { _buffer_storage.clear(); unbind(); - _qnn_rpc_buffer.reset(); + _rpc_buffer.reset(); } bool set_data_buffer(std::vector &&buffer) { @@ -162,20 +162,20 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - if (!_qnn_rpc_buffer) { - auto qnn_rpc_buffer = std::make_unique( + if (!_rpc_buffer) { + auto rpc_buffer = std::make_unique( _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); - if (!qnn_rpc_buffer->is_valid()) { + if (!rpc_buffer->is_valid()) { QNN_LOG_WARN("[%s]alloc rpc mem failed", _tensor_name.c_str()); return false; } - _qnn_rpc_buffer = std::move(qnn_rpc_buffer); + _rpc_buffer = std::move(rpc_buffer); } QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _qnn_rpc_buffer->get_mem_handle()); + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _rpc_buffer->get_mem_handle()); QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); @@ -206,8 +206,8 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - if (_qnn_rpc_buffer) { - memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); + if (_rpc_buffer) { + memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); } else { QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; @@ -227,8 +227,8 @@ class ggml_qnn_tensor { } if (should_use_mem_handle()) { - if (_qnn_rpc_buffer) { - memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); + if (_rpc_buffer) { + memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size); } else { QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); return false; @@ -283,7 +283,7 @@ class ggml_qnn_tensor { Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; - std::unique_ptr _qnn_rpc_buffer; + std::unique_ptr _rpc_buffer; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index ebfc0372375fd..dbfb3f51fd246 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -1,6 +1,8 @@ #include "utils.hpp" +#include + #include #include "ggml-qnn.h" @@ -199,6 +201,12 @@ intptr_t align_to(size_t alignment, intptr_t offset) { uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor) { return ggml_nbytes(tensor); } +void *page_align_alloc(size_t size) { + // TODO: fix this for other platforms + const size_t alignment = sysconf(_SC_PAGESIZE); + return align_alloc(alignment, size); +} + void *align_alloc(size_t alignment, size_t size) { size_t size_aligned = size; if ((size_aligned % alignment) != 0) { diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 40dff321b970e..4a10c277691af 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -33,6 +33,7 @@ const char *get_htparch_desc(size_t htp_arch); 
intptr_t align_to(size_t alignment, intptr_t offset); uint32_t get_ggml_tensor_data_size(const ggml_tensor *tensor); +void *page_align_alloc(size_t size); void *align_alloc(size_t alignment, size_t size); void align_free(void *ptr); From 5db783fdc62dd1ea5c7e9794ccf2d1564dbf4a2f Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 29 Nov 2024 22:32:23 +0800 Subject: [PATCH 03/34] wip --- ggml/src/ggml-qnn/qnn-lib.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index c6801b7771ee9..4ffa1faceb2ab 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -195,7 +195,7 @@ class qnn_instance { int qnn_init(const QnnSaver_Config_t **saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qni_init\n"); + QNN_LOG_DEBUG("enter qnn_init\n"); std::lock_guard lock(_init_mutex); if (load_system() != 0) { @@ -382,7 +382,7 @@ class qnn_instance { } } - QNN_LOG_DEBUG("leave qni_init\n"); + QNN_LOG_DEBUG("leave qnn_init\n"); return 0; } From ba49cf10b3be3fecc3a1e44abd7afd7e9b06752e Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 30 Nov 2024 00:04:59 +0800 Subject: [PATCH 04/34] add log --- ggml/src/ggml-qnn/backend-ops.cpp | 4 ++-- ggml/src/ggml-qnn/buffer.hpp | 10 +++++----- ggml/src/ggml-qnn/ggml-qnn.cpp | 2 ++ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 30930be422496..660cd1df128d2 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -248,7 +248,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c auto it = graph_cache.find(graph_key); qnn::ggml_qnn_graph *graph_ptr = nullptr; if (it != graph_cache.end()) { - QNN_LOG_DEBUG("found graph %s in cache\n", graph_key.c_str()); + QNN_LOG_DEBUG("[%s]found graph %s in cache", qnn::get_backend_name(ctx->device), graph_key.c_str()); graph_ptr = it->second.get(); } else { auto graph = @@ -260,7 +260,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { - QNN_LOG_ERROR("build_graph failed\n"); + QNN_LOG_ERROR("[%s]build_graph failed", qnn::get_backend_name(ctx->device)); return nullptr; } diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index e741a201a3e92..ac763cffcbceb 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -21,7 +21,7 @@ class qnn_rpc_buffer : public qnn_buffer_interface { public: qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) - : _qnn_instance(qnn_instance), _size(size) { + : _size(size), _qnn_instance(qnn_instance) { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); @@ -52,10 +52,10 @@ class qnn_rpc_buffer : public qnn_buffer_interface { Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; } private: - std::shared_ptr _qnn_instance; size_t _size = 0; uint8_t *_qnn_rpc_buffer = nullptr; Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr; + std::shared_ptr _qnn_instance; DISABLE_COPY(qnn_rpc_buffer); DISABLE_MOVE(qnn_rpc_buffer); @@ -71,7 
+71,7 @@ class qnn_mem_buffer : public qnn_buffer_interface { return; } - _buffer_size = size; + _size = size; } ~qnn_mem_buffer() { @@ -82,12 +82,12 @@ class qnn_mem_buffer : public qnn_buffer_interface { bool is_valid() const override { return _buffer != nullptr; } uint8_t *get_buffer() override { return _buffer; } - size_t get_size() const override { return _buffer_size; } + size_t get_size() const override { return _size; } Qnn_MemHandle_t get_mem_handle() const override { return nullptr; } private: + size_t _size = 0; uint8_t *_buffer = nullptr; - size_t _buffer_size = 0; DISABLE_COPY(qnn_mem_buffer); DISABLE_MOVE(qnn_mem_buffer); diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index d9dbebf9a0141..c12239ba7c7b1 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -171,6 +171,8 @@ ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer(ggml_backend_buf return nullptr; } + QNN_LOG_DEBUG("[%s]alloc buffer: %p, size: %ld", qnn::get_backend_name(get_device_context(buft->device)->device), + ctx->get_buffer(), size); return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); } From c55a9e2c0c1ba985840350b64b095b9c4eb4e50c Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 30 Nov 2024 00:32:40 +0800 Subject: [PATCH 05/34] wip --- ggml/src/ggml-qnn/ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index c12239ba7c7b1..aadbb51a7660d 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -244,7 +244,7 @@ ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || - node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } bool ok = qnn::ggml_qnn_forward(device_ctx, node); From 19e98073a4d3f3df33a63f8d01c49345df072697 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 30 Nov 2024 11:36:36 +0800 Subject: [PATCH 06/34] add qnn_buffer_ptr --- ggml/src/ggml-qnn/buffer.hpp | 3 +++ ggml/src/ggml-qnn/tensor.hpp | 18 +++++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index ac763cffcbceb..806a450956bf6 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include #include "logger.hpp" #include "qnn-lib.hpp" @@ -17,6 +18,8 @@ class qnn_buffer_interface { virtual Qnn_MemHandle_t get_mem_handle() const = 0; }; +using qnn_buffer_ptr = std::shared_ptr; + class qnn_rpc_buffer : public qnn_buffer_interface { public: qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index c9caadf2102e6..5d743a8dae997 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -163,11 +163,11 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (!_rpc_buffer) { - auto rpc_buffer = std::make_unique( + auto rpc_buffer = std::make_shared( _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!rpc_buffer->is_valid()) { - QNN_LOG_WARN("[%s]alloc rpc 
mem failed", _tensor_name.c_str()); + QNN_LOG_WARN("[%s][%s]alloc rpc mem failed", get_backend_name(_device), _tensor_name.c_str()); return false; } @@ -175,8 +175,16 @@ class ggml_qnn_tensor { } QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_MEMHANDLE); - QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, _rpc_buffer->get_mem_handle()); - QNN_LOG_DEBUG("[%s]use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); + auto mem_handle = _rpc_buffer->get_mem_handle(); + if (!mem_handle) { + QNN_LOG_WARN("[%s][%s]can't find rpcmem from qnn mem handle", get_backend_name(_device), + _tensor_name.c_str()); + return false; + } + + QNN_TENSOR_SET_MEM_HANDLE(_qnn_tensor, mem_handle); + QNN_LOG_DEBUG("[%s][%s]use mem handle %p", get_backend_name(_device), _tensor_name.c_str(), + QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {buffer, (uint32_t)buffer_size}; @@ -283,7 +291,7 @@ class ggml_qnn_tensor { Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; - std::unique_ptr _rpc_buffer; + qnn_buffer_ptr _rpc_buffer; DISABLE_COPY(ggml_qnn_tensor); DISABLE_MOVE(ggml_qnn_tensor); From 35f7e1cc6f2d5a1a136ff82afeef0673229c302a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 30 Nov 2024 12:03:20 +0800 Subject: [PATCH 07/34] remove tailing `\n` at log --- ggml/src/ggml-qnn/backend-ops.cpp | 12 +-- ggml/src/ggml-qnn/buffer.hpp | 8 +- ggml/src/ggml-qnn/graph.hpp | 14 +-- ggml/src/ggml-qnn/op-config.cpp | 24 ++--- ggml/src/ggml-qnn/qnn-lib.hpp | 145 +++++++++++++++--------------- ggml/src/ggml-qnn/tensor.hpp | 4 +- 6 files changed, 103 insertions(+), 104 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 660cd1df128d2..0e6e46c2ec9a8 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -15,13 +15,13 @@ namespace { bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src, ggml_tensor *dst) { if (!ctx || !src || !dst) { - QNN_LOG_WARN("invalid params\n"); + QNN_LOG_WARN("invalid params"); return false; } auto instance = ctx->instance; if (!instance) { - QNN_LOG_WARN("invalid instance\n"); + QNN_LOG_WARN("invalid instance"); return false; } @@ -31,13 +31,13 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) { if (!ctx || !src0 || !src1 || !dst) { - QNN_LOG_WARN("invalid params\n"); + QNN_LOG_WARN("invalid params"); return false; } auto instance = ctx->instance; if (!instance) { - QNN_LOG_WARN("invalid instance\n"); + QNN_LOG_WARN("invalid instance"); return false; } @@ -45,7 +45,7 @@ bool qnn_is_valid_params(ggml_backend_qnn_device_context *ctx, const ggml_tensor } void print_ggml_tensor(const ggml_tensor *tensor) { - QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld\n", tensor->name, ggml_type_name(tensor->type), + QNN_LOG_DEBUG("%s: type:%s ne: %ldx%ldx%ldx%ld, nb: %ldx%ldx%ldx%ld", tensor->name, ggml_type_name(tensor->type), (long)tensor->ne[0], (long)tensor->ne[1], (long)tensor->ne[2], (long)tensor->ne[3], (long)tensor->nb[0], (long)tensor->nb[1], (long)tensor->nb[2], (long)tensor->nb[3]); } @@ -96,7 +96,7 @@ template bool execute_graph(qnn::ggml_qnn_graph *graph, const std::array &inputs, ggml_tensor 
*output) { if (!graph->execute(to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<1>({output}))) { - QNN_LOG_WARN("execute failed\n"); + QNN_LOG_WARN("execute failed"); return false; } diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 806a450956bf6..5a0e86121da5d 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -26,15 +26,15 @@ class qnn_rpc_buffer : public qnn_buffer_interface { uint32_t *dimensions, Qnn_DataType_t data_type) : _size(size), _qnn_instance(qnn_instance) { - _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); + _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *))); _qnn_rpc_mem_handle = qnn_instance->register_rpcmem(_qnn_rpc_buffer, rank, dimensions, data_type); if (!_qnn_rpc_buffer || !_qnn_rpc_mem_handle) { - QNN_LOG_WARN("register rpc mem failure\n"); + QNN_LOG_WARN("register rpc mem failure"); // let the destructor free the buffer return; } - QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d\n", _qnn_rpc_buffer, (int)size); + QNN_LOG_DEBUG("alloc rpcmem(%p) successfully, size %d", _qnn_rpc_buffer, (int)size); } ~qnn_rpc_buffer() { if (_qnn_instance) { @@ -70,7 +70,7 @@ class qnn_mem_buffer : public qnn_buffer_interface { _buffer = reinterpret_cast(qnn::page_align_alloc(size)); if (!_buffer) { - QNN_LOG_WARN("failed to allocate %.2f MiB\n", float(size / (1 << 20))); + QNN_LOG_WARN("failed to allocate %.2f MiB", float(size / (1 << 20))); return; } diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 1b0dcd78faa17..1806f41126f3c 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -64,12 +64,12 @@ class ggml_qnn_graph { } if (error != QNN_SUCCESS) { - QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s\n", get_backend_name(device), - graph_name.c_str(), get_qnn_error_string(error)); + QNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", get_backend_name(device), graph_name.c_str(), + get_qnn_error_string(error)); return; } - QNN_LOG_INFO("[%s][%s]create succeed\n", get_backend_name(device), graph_name.c_str()); + QNN_LOG_INFO("[%s][%s]create succeed", get_backend_name(device), graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } @@ -80,7 +80,7 @@ class ggml_qnn_graph { const ggml_tensor_array_t &tensor_outputs) { GGML_ASSERT(op_constructor); if (!is_valid()) { - QNN_LOG_ERROR("Invalid graph\n"); + QNN_LOG_ERROR("Invalid graph"); return false; } @@ -92,7 +92,7 @@ class ggml_qnn_graph { } if (!_op_config->add_op_to_graph(_graph_handle)) { - QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); + QNN_LOG_ERROR("[%s]add nodes failed", _graph_name.c_str()); return false; } @@ -109,12 +109,12 @@ class ggml_qnn_graph { bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { if (!_op_config->bind_input_tensors(tensor_inputs)) { - QNN_LOG_ERROR("[%s][%s]bind input tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind input tensors failed", get_backend_name(_device), _graph_name.c_str()); return false; } if (!_op_config->bind_output_tensors(tensor_outputs)) { - QNN_LOG_ERROR("[%s][%s]bind output tensors failed\n", get_backend_name(_device), _graph_name.c_str()); + QNN_LOG_ERROR("[%s][%s]bind output tensors failed", get_backend_name(_device), _graph_name.c_str()); return false; } diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp 
index df70d548a44e0..7ee5eb38a17a8 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -82,7 +82,7 @@ bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::qnn_tensor_ for (size_t i = 0; i < ggml_tensors.size(); i++) { auto *ggml_tensor = ggml_tensors[i]; if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + QNN_LOG_ERROR("bind tensor %s failed", ggml_get_name(ggml_tensor)); return false; } @@ -162,12 +162,12 @@ bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qn GGML_ASSERT(data_size > 0); if (!param_tensor->bind_buffer(const_cast(data), data_size)) { - QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); + QNN_LOG_ERROR("parameter tensor bind_buffer failed"); return false; } if (!param_tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n"); + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed"); return false; } @@ -185,26 +185,26 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); - QNN_LOG_DEBUG("[%s]add to graph start\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]add to graph start", _name.c_str()); for (size_t i = 0; i < _tensor_inputs.size(); i++) { auto tensor = _tensor_inputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]input tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]input tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); } for (size_t i = 0; i < _tensor_outputs.size(); i++) { auto tensor = _tensor_outputs[i]; if (!tensor->alloc_qnn_tensor_id()) { - QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed", _name.c_str()); return false; } - QNN_LOG_DEBUG("[%s]output tensor id: %d\n", _name.c_str(), tensor->get_qnn_tensor_id()); + QNN_LOG_DEBUG("[%s]output tensor id: %d", _name.c_str(), tensor->get_qnn_tensor_id()); _qnn_tensor_outputs[i] = tensor->get_qnn_tensor(); } @@ -215,7 +215,7 @@ bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { return false; } - QNN_LOG_DEBUG("[%s]added to graph succeed\n", _name.c_str()); + QNN_LOG_DEBUG("[%s]added to graph succeed", _name.c_str()); return true; } @@ -301,7 +301,7 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph // create convert nodes qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { - QNN_LOG_ERROR("create convert nodes failed\n"); + QNN_LOG_ERROR("create convert nodes failed"); return false; } @@ -387,7 +387,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap // create tensors for convert node auto tensor_type = get_tensor_type(tensor_inputs); - QNN_LOG_DEBUG("input tensor type: %s\n", qnn_datatype_to_string(tensor_type)); + QNN_LOG_DEBUG("input tensor type: %s", qnn_datatype_to_string(tensor_type)); _input_converts.resize(tensor_inputs.size()); for (size_t i = 0; i < 
tensor_inputs.size(); ++i) { @@ -580,7 +580,7 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) { // For QNN_OP_MAT_MUL, we need to transpose the input tensor return [](const std::string &instance_name, std::shared_ptr qnn_instance) -> std::unique_ptr { - QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); return std::make_unique(instance_name, qnn_instance); }; } else if (op_name == QNN_OP_TRANSPOSE) { diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index 4ffa1faceb2ab..454c0c6aa32c5 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -195,21 +195,21 @@ class qnn_instance { int qnn_init(const QnnSaver_Config_t **saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; - QNN_LOG_DEBUG("enter qnn_init\n"); + QNN_LOG_DEBUG("enter qnn_init"); std::lock_guard lock(_init_mutex); if (load_system() != 0) { - QNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + QNN_LOG_WARN("can not load QNN system lib, pls check why?"); return 1; } else { - QNN_LOG_DEBUG("load QNN system lib successfully\n"); + QNN_LOG_DEBUG("load QNN system lib successfully"); } std::string backend_lib_path = _lib_path + _backend_name; if (_lib_path_to_backend_id.count(backend_lib_path) == 0) { int is_load_ok = load_backend(backend_lib_path, saver_config); if (is_load_ok != 0) { - QNN_LOG_WARN("failed to load QNN backend\n"); + QNN_LOG_WARN("failed to load QNN backend"); return 2; } } @@ -218,7 +218,7 @@ class qnn_instance { if (_loaded_backend.count(backend_id) == 0 || _loaded_lib_handle.count(backend_id) == 0) { QNN_LOG_WARN( "library %s is loaded but loaded backend count=%zu, " - "loaded lib_handle count=%zu\n", + "loaded lib_handle count=%zu", backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); return 3; } @@ -227,28 +227,28 @@ class qnn_instance { _qnn_interface->qnn_log_create(qnn::sdk_logcallback, _qnn_log_level, &_qnn_log_handle); if (nullptr == _qnn_log_handle) { // NPU backend not work on Qualcomm SoC equipped low-end phone - QNN_LOG_WARN("why failed to initialize qnn log\n"); + QNN_LOG_WARN("why failed to initialize qnn log"); return 4; } else { - QNN_LOG_DEBUG("initialize qnn log successfully\n"); + QNN_LOG_DEBUG("initialize qnn log successfully"); } std::vector temp_backend_config; _qnn_interface->qnn_backend_create( _qnn_log_handle, temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { - QNN_LOG_WARN("why failed to initialize qnn backend\n"); + QNN_LOG_WARN("why failed to initialize qnn backend"); return 5; } else { - QNN_LOG_DEBUG("initialize qnn backend successfully\n"); + QNN_LOG_DEBUG("initialize qnn backend successfully"); } Qnn_ErrorHandle_t qnn_status = _qnn_interface->qnn_property_has_capability(QNN_PROPERTY_GROUP_DEVICE); if (QNN_PROPERTY_NOT_SUPPORTED == qnn_status) { - QNN_LOG_WARN("device property is not supported\n"); + QNN_LOG_WARN("device property is not supported"); } if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnn_status) { - QNN_LOG_WARN("device property is not known to backend\n"); + QNN_LOG_WARN("device property is not known to backend"); } qnn_status = QNN_SUCCESS; @@ -294,9 +294,9 @@ class qnn_instance { qnn_status = _qnn_interface->qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); } if (QNN_SUCCESS != qnn_status && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnn_status) { - QNN_LOG_WARN("failed to create QNN device\n"); + QNN_LOG_WARN("failed to create QNN device"); } else { - QNN_LOG_INFO("create QNN device successfully\n"); + QNN_LOG_INFO("create QNN device successfully"); } if (_profile_level != sdk_profile_level::profile_off) { @@ -306,19 +306,19 @@ class qnn_instance { if (QNN_PROFILE_NO_ERROR != _qnn_interface->qnn_profile_create(_qnn_backend_handle, profile_level, &_qnn_profile_handle)) { - QNN_LOG_WARN("unable to create profile handle in the backend\n"); + QNN_LOG_WARN("unable to create profile handle in the backend"); return 6; } else { - QNN_LOG_DEBUG("initialize qnn profile successfully\n"); + QNN_LOG_DEBUG("initialize qnn profile successfully"); } } _rpc_lib_handle = dl_load("libcdsprpc.so"); if (nullptr == _rpc_lib_handle) { - QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dl_error()); + QNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s", dl_error()); return 8; } else { - QNN_LOG_DEBUG("load rpcmem lib successfully\n"); + QNN_LOG_DEBUG("load rpcmem lib successfully"); set_rpcmem_initialized(true); } _pfn_rpc_mem_init = reinterpret_cast(dl_sym(_rpc_lib_handle, "rpcmem_init")); @@ -343,10 +343,10 @@ class qnn_instance { */ _qnn_interface->qnn_context_create(_qnn_backend_handle, _qnn_device_handle, nullptr, &_qnn_context_handle); if (nullptr == _qnn_context_handle) { - QNN_LOG_WARN("why failed to initialize qnn context\n"); + QNN_LOG_WARN("why failed to initialize qnn context"); return 10; } else { - QNN_LOG_DEBUG("initialize qnn context successfully\n"); + QNN_LOG_DEBUG("initialize qnn context successfully"); } if (_backend_name.find("Htp") != _backend_name.npos) { @@ -359,7 +359,7 @@ class qnn_instance { for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * size_in_mb, sizeof(void *))); if (!rpc_buffer) { - QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + QNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -369,7 +369,7 @@ class qnn_instance { } _rpcmem_capacity = std::max(candidate_size, _rpcmem_capacity); - QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB\n", _rpcmem_capacity); + QNN_LOG_INFO("capacity of QNN rpc ion memory is about %d MB", _rpcmem_capacity); if (0 != init_htp_perfinfra()) { QNN_LOG_WARN("initialize HTP performance failure"); @@ -382,7 +382,7 @@ class qnn_instance { } } - QNN_LOG_DEBUG("leave 
qnn_init\n"); + QNN_LOG_DEBUG("leave qnn_init"); return 0; } @@ -395,9 +395,9 @@ class qnn_instance { _pfn_rpc_mem_deinit(); if (dl_unload(_rpc_lib_handle) != 0) { - QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dl_error()); + QNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s", dl_error()); } else { - QNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + QNN_LOG_DEBUG("succeed to close rpcmem lib"); } if (_backend_name.find("Htp") != _backend_name.npos) { @@ -407,7 +407,7 @@ class qnn_instance { if (nullptr != _qnn_context_handle) { error = _qnn_interface->qnn_context_free(_qnn_context_handle, _qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_context_handle = nullptr; @@ -416,7 +416,7 @@ class qnn_instance { if (nullptr != _qnn_profile_handle) { error = _qnn_interface->qnn_profile_free(_qnn_profile_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_profile_handle = nullptr; @@ -425,7 +425,7 @@ class qnn_instance { if (nullptr != _qnn_device_handle) { error = _qnn_interface->qnn_device_free(_qnn_device_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_device_handle = nullptr; @@ -434,7 +434,7 @@ class qnn_instance { if (nullptr != _qnn_backend_handle) { error = _qnn_interface->qnn_backend_free(_qnn_backend_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_backend_handle = nullptr; @@ -443,7 +443,7 @@ class qnn_instance { if (nullptr != _qnn_log_handle) { error = _qnn_interface->qnn_log_free(_qnn_log_handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", _qnn_interface->get_backend_id(), + QNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d", _qnn_interface->get_backend_id(), QNN_GET_ERROR_CODE(error)); } _qnn_log_handle = nullptr; @@ -458,7 +458,7 @@ class qnn_instance { std::shared_ptr get_qnn_interface() { if (!_qnn_interface) { - QNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + QNN_LOG_WARN("pls check why _qnn_interface is not loaded"); } return _qnn_interface; } @@ -479,10 +479,10 @@ class qnn_instance { QnnDevice_Infrastructure_t device_infra = nullptr; int error = _qnn_interface->qnn_device_get_infrastructure(&device_infra); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get qnn device infra\n"); + QNN_LOG_WARN("failed to get qnn device infra"); return 1; } else { - QNN_LOG_INFO("HTP backend perf_infrastructure creation ok\n"); + QNN_LOG_INFO("HTP backend perf_infrastructure creation ok"); } QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); @@ -494,7 +494,7 @@ class qnn_instance { if (htp_infra->infraType != QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF) { 
QNN_LOG_INFO("HTP infra type = %d, which is not perf infra type", htp_infra->infraType); } else { - QNN_LOG_INFO("HTP infra type = %d, which is perf infra type\n", htp_infra->infraType); + QNN_LOG_INFO("HTP infra type = %d, which is perf infra type", htp_infra->infraType); } _qnn_htp_perfinfra = htp_perfinfra; _qnn_power_configid = power_configid; @@ -520,12 +520,12 @@ class qnn_instance { nullptr}; Qnn_ErrorHandle_t qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp perf failed\n"); + QNN_LOG_WARN("set htp perf failed"); } else { - QNN_LOG_DEBUG("set htp perf ok\n"); + QNN_LOG_DEBUG("set htp perf ok"); } } else { - QNN_LOG_WARN("can't set htp perf\n"); + QNN_LOG_WARN("can't set htp perf"); } return 0; @@ -533,7 +533,7 @@ class qnn_instance { int set_high_performance_mode() { if (nullptr == _qnn_htp_perfinfra) { - QNN_LOG_WARN("perf intra is null\n"); + QNN_LOG_WARN("perf intra is null"); return 1; } @@ -566,9 +566,9 @@ class qnn_instance { Qnn_ErrorHandle_t qnn_status = QNN_SUCCESS; qnn_status = _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); if (qnn_status != QNN_SUCCESS) { - QNN_LOG_WARN("set htp high performance mode failed\n"); + QNN_LOG_WARN("set htp high performance mode failed"); } else { - QNN_LOG_DEBUG("set htp high performance mode ok\n"); + QNN_LOG_DEBUG("set htp high performance mode ok"); } return 0; @@ -584,21 +584,21 @@ class qnn_instance { void *alloc_rpcmem(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); return nullptr; } auto allocate_bytes = static_cast(bytes + alignment); void *buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (int)allocate_bytes); if (!buf) { - QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB\n", (int)(allocate_bytes / (1 << 20))); + QNN_LOG_WARN("failed to allocate rpc memory, size: %d MB", (int)(allocate_bytes / (1 << 20))); return nullptr; } auto aligned_buf = reinterpret_cast(qnn::align_to(alignment, reinterpret_cast(buf))); bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { - QNN_LOG_WARN("failed to allocate rpc memory\n"); + QNN_LOG_WARN("failed to allocate rpc memory"); _pfn_rpc_mem_free(buf); } @@ -607,9 +607,9 @@ class qnn_instance { void free_rpcmem(void *buf) { if (!_rpcmem_initialized) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); } else if (_rpcmem_store_map.count(buf) == 0) { - QNN_LOG_WARN("no allocated tensor\n"); + QNN_LOG_WARN("no allocated tensor"); } else { _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); @@ -619,7 +619,7 @@ class qnn_instance { int32_t rpcmem_to_fd(void *buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); } else { mem_fd = _pfn_rpc_mem_to_fd(buf); } @@ -629,52 +629,51 @@ class qnn_instance { Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { if (!p_data) { - QNN_LOG_WARN("invalid param\n"); + QNN_LOG_WARN("invalid param"); return nullptr; } if (!is_rpcmem_initialized()) { - QNN_LOG_WARN("rpc memory not initialized\n"); + QNN_LOG_WARN("rpc memory not initialized"); return nullptr; } if (is_rpcmem_registered(p_data)) { - QNN_LOG_WARN("rpc memory already registered\n"); + QNN_LOG_WARN("rpc 
memory already registered"); return _qnn_rpc_buffer_to_handles[p_data]; } auto mem_fd = rpcmem_to_fd(p_data); if (mem_fd == -1) { - QNN_LOG_WARN("failed to get file descriptor\n"); + QNN_LOG_WARN("failed to get file descriptor"); return nullptr; } - QNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + QNN_LOG_DEBUG("mem_fd %d", mem_fd); Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; Qnn_MemHandle_t handle = nullptr; auto error = _qnn_interface->qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), - strerror(error)); + QNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); return nullptr; } _qnn_rpc_buffer_to_handles.insert({p_data, handle}); - QNN_LOG_DEBUG("successfully register shared memory handler: %p\n", handle); + QNN_LOG_DEBUG("successfully register shared memory handler: %p", handle); return handle; } void unregister_rpcmem(Qnn_MemHandle_t mem_handle) { Qnn_ErrorHandle_t error = _qnn_interface->qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); } auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), [mem_handle](const auto &kv) { return kv.second == mem_handle; }); if (it == _qnn_rpc_buffer_to_handles.end()) { - QNN_LOG_WARN("failed to find shared memory handler: %p\n", mem_handle); + QNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); return; } @@ -691,18 +690,18 @@ class qnn_instance { Qnn_ErrorHandle_t error = QNN_SUCCESS; std::string system_lib_path = _lib_path + "libQnnSystem.so"; - QNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + QNN_LOG_DEBUG("system_lib_path:%s", system_lib_path.c_str()); auto system_lib_handle = dl_load(system_lib_path); if (!system_lib_handle) { - QNN_LOG_WARN("can not load QNN library %s, error: %s\n", system_lib_path.c_str(), dl_error()); + QNN_LOG_WARN("can not load QNN library %s, error: %s", system_lib_path.c_str(), dl_error()); return 1; } auto *get_providers = dl_sym_typed( system_lib_handle, "QnnSystemInterface_getProviders"); if (!get_providers) { - QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dl_error()); + QNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s", dl_error()); return 2; } @@ -710,17 +709,17 @@ class qnn_instance { const QnnSystemInterface_t **provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); return 3; } if (num_providers != _required_num_providers) { - QNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("can not get providers\n"); + QNN_LOG_WARN("can not get providers"); return 5; } @@ -735,15 +734,15 @@ class qnn_instance { } } if (!found_valid_system_interface) { - QNN_LOG_WARN("unable to find a valid qnn system interface\n"); + QNN_LOG_WARN("unable to find a 
valid qnn system interface"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn system interface\n"); + QNN_LOG_DEBUG("find a valid qnn system interface"); } auto qnn_sys_interface = std::make_shared(*provider_list[0], system_lib_handle); if (!qnn_sys_interface->is_valid()) { - QNN_LOG_WARN("failed to create QNN system interface\n"); + QNN_LOG_WARN("failed to create QNN system interface"); return 7; } @@ -753,7 +752,7 @@ class qnn_instance { int load_backend(std::string &lib_path, const QnnSaver_Config_t ** /*saver_config*/) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - QNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + QNN_LOG_DEBUG("lib_path:%s", lib_path.c_str()); auto lib_handle = dl_load(lib_path.c_str()); if (!lib_handle) { @@ -775,14 +774,14 @@ class qnn_instance { QNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); return 3; } - QNN_LOG_DEBUG("num_providers=%d\n", num_providers); + QNN_LOG_DEBUG("num_providers=%d", num_providers); if (num_providers != _required_num_providers) { QNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); return 4; } if (!provider_list) { - QNN_LOG_WARN("failed to get qnn interface providers\n"); + QNN_LOG_WARN("failed to get qnn interface providers"); return 5; } bool found_valid_interface = false; @@ -797,23 +796,23 @@ class qnn_instance { } if (!found_valid_interface) { - QNN_LOG_WARN("unable to find a valid qnn interface\n"); + QNN_LOG_WARN("unable to find a valid qnn interface"); return 6; } else { - QNN_LOG_DEBUG("find a valid qnn interface\n"); + QNN_LOG_DEBUG("find a valid qnn interface"); } BackendIdType backend_id = provider_list[0]->backendId; _lib_path_to_backend_id[lib_path] = backend_id; if (_loaded_backend.count(backend_id) > 0) { - QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", lib_path.c_str(), backend_id); + QNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists", lib_path.c_str(), backend_id); } _loaded_backend[backend_id] = provider_list[0]; if (_loaded_lib_handle.count(backend_id) > 0) { - QNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + QNN_LOG_WARN("closing %p", _loaded_lib_handle[backend_id]); int dlclose_error = dl_unload(_loaded_lib_handle[backend_id]); if (dlclose_error != 0) { - QNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dl_error()); + QNN_LOG_WARN("fail to close %p with error %s", _loaded_lib_handle[backend_id], dl_error()); } } _loaded_lib_handle[backend_id] = lib_handle; @@ -827,7 +826,7 @@ class qnn_instance { for (auto &it : _loaded_lib_handle) { dlclose_error = dl_unload(it.second); if (dlclose_error != 0) { - QNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dl_error()); + QNN_LOG_WARN("failed to close QNN backend %d, error %s", it.first, dl_error()); } } diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 5d743a8dae997..25502c38d0154 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -73,7 +73,7 @@ class ggml_qnn_tensor { auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { - QNN_LOG_WARN("[%s]allocate id failed , error: %d\n", _tensor_name.c_str(), error); + QNN_LOG_WARN("[%s]allocate id failed , error: %d", _tensor_name.c_str(), error); return false; } @@ -217,7 +217,7 @@ class ggml_qnn_tensor { if (_rpc_buffer) { memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); } 
else { - QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); + QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); return false; } } From 7dcdc720f75bbe0d0c790bb7cbc1a3df8f31a2cb Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 30 Nov 2024 20:46:29 +0800 Subject: [PATCH 08/34] add log --- ggml/src/ggml-qnn/ggml-qnn.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index aadbb51a7660d..4cfcd1ca978ac 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -417,6 +417,12 @@ bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_ return ggml_backend_buft_is_host(buft); } +bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor *op) { + auto *device_ctx = get_device_context(dev); + QNN_LOG_DEBUG("[%s][%s]offload op", qnn::get_backend_name(device_ctx->device), ggml_op_name(op->op)); + return false; +} + const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .get_name = */ ggml_backend_qnn_device_get_name, /* .get_description = */ ggml_backend_qnn_device_get_description, @@ -429,7 +435,7 @@ const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_ptr, /* .supports_op = */ ggml_backend_qnn_device_supports_op, /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, - /* .offload_op = */ nullptr, + /* .offload_op = */ ggml_backend_qnn_device_offload_op, /* .event_new = */ nullptr, /* .event_free = */ nullptr, /* .event_synchronize = */ nullptr, From 7da6ea9bcb414cba1a53bfad4167745ad0edf611 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 1 Dec 2024 12:34:03 +0800 Subject: [PATCH 09/34] enable GGML_OP_NONE --- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- ggml/src/ggml-qnn/ggml-qnn.cpp | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 0e6e46c2ec9a8..4c2505d4f482c 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -332,7 +332,7 @@ bool qnn_binary_nop_impl(ggml_backend_qnn_device_context *ctx, ggml_tensor *src0 } constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { - nullptr, // GGML_OP_NONE + qnn_unary_nop_impl, // GGML_OP_NONE nullptr, // GGML_OP_DUP nullptr, // GGML_OP_ADD nullptr, // GGML_OP_ADD1 diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 4cfcd1ca978ac..0defff42d622c 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -143,7 +143,7 @@ void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) memset(ctx->get_buffer(), value, ctx->get_size()); } -ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { +constexpr const ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, @@ -243,8 +243,7 @@ ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * auto *device_ctx = get_device_context(backend->device); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || - node->op == GGML_OP_PERMUTE || node->op == 
GGML_OP_NONE) { + if (ggml_is_empty(node)) { continue; } bool ok = qnn::ggml_qnn_forward(device_ctx, node); @@ -256,7 +255,7 @@ ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * return result; } -ggml_backend_i ggml_backend_qnn_interface = { +constexpr const ggml_backend_i ggml_backend_qnn_interface = { /* .get_name = */ ggml_backend_qnn_name, /* .free = */ ggml_backend_qnn_free, /* .set_tensor_async = */ nullptr, From 7c925df8f9de4b5bb488c91f90eeb34e5a9eb0d1 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 1 Dec 2024 15:56:31 +0800 Subject: [PATCH 10/34] wip --- ggml/src/ggml-qnn/ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 0defff42d622c..dfe45c36ea26f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -185,7 +185,7 @@ size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buf size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); // TODO: get the max size from device - return 1024 * 1024 * 1024; + return 1024L * 1024 * 1024; } bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { From 3429e3a958c33796c33d11b863679189186c6bdf Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 1 Dec 2024 23:18:49 +0800 Subject: [PATCH 11/34] wip --- ggml/src/ggml-qnn/backend-ops.cpp | 37 ++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 4c2505d4f482c..ea9671a95f4a8 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -543,22 +543,39 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } -#ifndef NDEBUG - auto *type_name = ggml_get_type_traits(tensor->type)->type_name; -#endif switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: case GGML_TYPE_Q8_0: case GGML_TYPE_Q4_0: if (!(ctx->supported_types & (1 << tensor->type))) { - QNN_LOG_DEBUG("unsupported data type %s for backend %s, supported_types: 0x%x", type_name, - qnn::get_backend_name(ctx->device), ctx->supported_types); + QNN_LOG_DEBUG("[%s]unsupported data type %s, supported_types: 0x%x", qnn::get_backend_name(ctx->device), + ggml_type_name(tensor->type), ctx->supported_types); + return false; + } + + if (tensor->ne[0] != ggml_type_size(tensor->type)) { + QNN_LOG_DEBUG("[%s]unsupported data type %s, ne0: %ld, type_size: %ld", + qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), (long)tensor->ne[0], + (long)ggml_type_size(tensor->type)); return false; } + + for (size_t i = 1; i < GGML_MAX_DIMS; ++i) { + const auto row_stride = + tensor->nb[i - 1] * (i == 1 ? 
((tensor->ne[0] / ggml_blck_size(tensor->type)) + tensor->padding[1]) + : tensor->ne[i - 1]); + if (tensor->ne[i] != row_stride) { + QNN_LOG_DEBUG("[%s]unsupported data type %s, ne%d: %ld, row_stride: %ld", + qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), (int)i, + (long)tensor->ne[i], (long)row_stride); + return false; + } + } break; default: - QNN_LOG_DEBUG("unsupported data type %s", type_name); + QNN_LOG_DEBUG("[%s]unsupported data type %s", qnn::get_backend_name(ctx->device), + ggml_type_name(tensor->type)); return false; } @@ -566,6 +583,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + constexpr const size_t kMaxNpuTensorSize = 8192L * 2048 + 8192 * 512 + 2048 * 512; constexpr const auto get_tensor_size = [](const ggml_tensor *tensor) -> size_t { return tensor->ne[0] * tensor->ne[1] * tensor->ne[2] * tensor->ne[3]; }; @@ -582,8 +600,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm QNN_LOG_DEBUG("[qnn-npu] src0 and src1 dimensions are not equal, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; - } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= - (8192 * 2048 + 8192 * 512 + 2048 * 512)) { + } else if (get_tensor_size(src0) + get_tensor_size(src1) + get_tensor_size(op) >= kMaxNpuTensorSize) { QNN_LOG_DEBUG("[qnn-npu] tensor size is too large, support/unsupported: %d/%d", ctx->support_op_count.load(), ++(ctx->unsupported_op_count)); return false; @@ -624,6 +641,7 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso return true; } + auto *src0 = op->src[0]; if (op->op == GGML_OP_UNARY) { const auto unary_op = ggml_get_unary_op(op); if (unary_op == GGML_UNARY_OP_GELU && ctx->device == QNN_BACKEND_NPU) { @@ -637,7 +655,7 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso return false; } - if (!op->src[0]) { + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op)) { QNN_LOG_DEBUG("src0 is nullptr"); return false; } @@ -647,7 +665,6 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso return false; } - auto *src0 = op->src[0]; auto *src1 = op->src[1]; if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, op) || (kQnnBinaryOpsTable[op->op] && !ggml_qnn_supports_tensor(ctx, src1))) { From 7d572d065006d9361742f81c442495f382f80f2a Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 2 Dec 2024 13:22:18 +0800 Subject: [PATCH 12/34] disable tensor with view --- ggml/src/ggml-qnn/backend-ops.cpp | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index ea9671a95f4a8..b21283fea81e5 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -543,6 +543,11 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } + if (tensor->view_src && tensor->view_offs) { + QNN_LOG_DEBUG("[%s]tensor(%s) is a view", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type)); + return false; + } + switch (tensor->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: @@ -553,25 +558,6 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t ggml_type_name(tensor->type), ctx->supported_types); 
return false; } - - if (tensor->ne[0] != ggml_type_size(tensor->type)) { - QNN_LOG_DEBUG("[%s]unsupported data type %s, ne0: %ld, type_size: %ld", - qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), (long)tensor->ne[0], - (long)ggml_type_size(tensor->type)); - return false; - } - - for (size_t i = 1; i < GGML_MAX_DIMS; ++i) { - const auto row_stride = - tensor->nb[i - 1] * (i == 1 ? ((tensor->ne[0] / ggml_blck_size(tensor->type)) + tensor->padding[1]) - : tensor->ne[i - 1]); - if (tensor->ne[i] != row_stride) { - QNN_LOG_DEBUG("[%s]unsupported data type %s, ne%d: %ld, row_stride: %ld", - qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type), (int)i, - (long)tensor->ne[i], (long)row_stride); - return false; - } - } break; default: QNN_LOG_DEBUG("[%s]unsupported data type %s", qnn::get_backend_name(ctx->device), From a559e87308747ce69dcf0b94482054bc15b606c2 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 2 Dec 2024 14:13:41 +0800 Subject: [PATCH 13/34] wip --- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index b21283fea81e5..70bf0bef1542e 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -543,7 +543,7 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t return false; } - if (tensor->view_src && tensor->view_offs) { + if (tensor->view_src) { QNN_LOG_DEBUG("[%s]tensor(%s) is a view", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type)); return false; } From 348fd052d420f9661e8feca2555a4949bc98354b Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 2 Dec 2024 20:27:21 +0800 Subject: [PATCH 14/34] wip --- ggml/src/ggml-qnn/buffer.hpp | 8 +++++++- ggml/src/ggml-qnn/tensor.hpp | 18 ++++-------------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 5a0e86121da5d..9573e160b4176 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -66,7 +66,7 @@ class qnn_rpc_buffer : public qnn_buffer_interface { class qnn_mem_buffer : public qnn_buffer_interface { public: - qnn_mem_buffer(size_t size) { + explicit qnn_mem_buffer(const uint8_t *data, size_t size) { _buffer = reinterpret_cast(qnn::page_align_alloc(size)); if (!_buffer) { @@ -75,8 +75,14 @@ class qnn_mem_buffer : public qnn_buffer_interface { } _size = size; + + if (data) { + memcpy(_buffer, data, size); + } } + explicit qnn_mem_buffer(size_t size) : qnn_mem_buffer(nullptr, size) {} + ~qnn_mem_buffer() { // the free will do nothing if the _buffer is nullptr qnn::align_free(_buffer); diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 25502c38d0154..31fee800606d3 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -213,13 +213,8 @@ class ggml_qnn_tensor { return true; } - if (should_use_mem_handle()) { - if (_rpc_buffer) { - memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); - } else { - QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); - return false; - } + if (_rpc_buffer) { + memcpy(_rpc_buffer->get_buffer(), _buffer, _buffer_size); } // For CPU and GPU, the data is already in the tensor. 
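The two qnn_mem_buffer constructors above keep a single allocation path: the (data, size) overload allocates a page-aligned block and, when a source pointer is given, copies the caller's bytes into it, while the size-only overload simply delegates with a null data pointer. Below is a minimal standalone sketch of the same pattern; posix_memalign and sysconf stand in for the internal qnn::page_align_alloc / qnn::align_free helpers, and the class and member names are illustrative only, not part of the patch.

```cpp
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

class aligned_host_buffer {
public:
    // Allocate a page-aligned block and, if data is non-null, copy size bytes into it.
    explicit aligned_host_buffer(const uint8_t *data, size_t size) {
        void *ptr = nullptr;
        const size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
        if (posix_memalign(&ptr, page_size, size) != 0) {
            return; // allocation failed, the buffer stays invalid
        }

        _buffer = static_cast<uint8_t *>(ptr);
        _size = size;
        if (data) {
            memcpy(_buffer, data, size);
        }
    }

    // Size-only construction delegates to the two-argument constructor.
    explicit aligned_host_buffer(size_t size) : aligned_host_buffer(nullptr, size) {}

    ~aligned_host_buffer() { free(_buffer); } // free(nullptr) is a no-op

    bool is_valid() const { return _buffer != nullptr; }
    uint8_t *get_buffer() { return _buffer; }
    size_t get_size() const { return _size; }

private:
    uint8_t *_buffer = nullptr;
    size_t _size = 0;

    aligned_host_buffer(const aligned_host_buffer &) = delete;
    aligned_host_buffer &operator=(const aligned_host_buffer &) = delete;
};
```

The delegating constructor is the design point: wrapping existing host data and allocating plain scratch memory share one allocation and alignment path, so only one place has to get the page alignment and failure handling right.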
@@ -234,13 +229,8 @@ class ggml_qnn_tensor { return true; } - if (should_use_mem_handle()) { - if (_rpc_buffer) { - memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size); - } else { - QNN_LOG_WARN("[%s]can't find rpcmem from qnn mem handle", _tensor_name.c_str()); - return false; - } + if (_rpc_buffer) { + memcpy(_buffer, _rpc_buffer->get_buffer(), _buffer_size); } // For CPU and GPU, the data is already in the tensor. From 3f5e2ad12c94ef88443e1c0dddec47ee70c67687 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 3 Dec 2024 10:04:31 +0800 Subject: [PATCH 15/34] more log for view tensor --- ggml/src/ggml-qnn/backend-ops.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 70bf0bef1542e..f5e9f36a33b38 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -544,7 +544,11 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t } if (tensor->view_src) { - QNN_LOG_DEBUG("[%s]tensor(%s) is a view", qnn::get_backend_name(ctx->device), ggml_type_name(tensor->type)); + auto *src_tensor = tensor->view_src; + QNN_LOG_DEBUG("[%s]tensor(%s_%dx%dx%dx%d) is a view, src: %s_%dx%dx%dx%d", qnn::get_backend_name(ctx->device), + ggml_get_name(tensor), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], + src_tensor->ne[3]); return false; } From 1de7403d7dc4df248e04f1820002bf91869d65b8 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 4 Dec 2024 10:19:14 +0800 Subject: [PATCH 16/34] re-enable view --- ggml/src/ggml-qnn/backend-ops.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f5e9f36a33b38..f4a820af38a8f 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -549,7 +549,6 @@ bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_t ggml_get_name(tensor), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_get_name(src_tensor), src_tensor->ne[0], src_tensor->ne[1], src_tensor->ne[2], src_tensor->ne[3]); - return false; } switch (tensor->type) { @@ -673,6 +672,9 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso case GGML_OP_MUL_MAT: return ggml_qnn_supports_matmul_op(ctx, op); + case GGML_OP_VIEW: + return true; + default: return false; } From a77e3825aa45f35a6ce09f48317bbbdc86b501ec Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 4 Dec 2024 18:01:09 +0800 Subject: [PATCH 17/34] wip --- ggml/src/ggml-qnn/ggml-qnn.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index dfe45c36ea26f..60b0624da8c16 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -248,7 +248,7 @@ ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph * } bool ok = qnn::ggml_qnn_forward(device_ctx, node); if (!ok) { - QNN_LOG_DEBUG("error: op not supported %s (%s)\n", node->name, ggml_op_name(node->op)); + QNN_LOG_DEBUG("error: op not supported %s (%s)", node->name, ggml_op_name(node->op)); } } @@ -353,9 +353,9 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, } } else { if (setenv("LD_LIBRARY_PATH", path.c_str(), 1) == 0) { - QNN_LOG_DEBUG("%s backend setenv successfully\n", 
qnn::get_backend_name(device)); + QNN_LOG_DEBUG("%s backend setenv successfully", qnn::get_backend_name(device)); } else { - QNN_LOG_ERROR("%s backend setenv failure\n", qnn::get_backend_name(device)); + QNN_LOG_ERROR("%s backend setenv failure", qnn::get_backend_name(device)); } } #endif @@ -363,12 +363,12 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, auto instance = std::make_shared(path, dev_ctx->lib_name, "ggml"); auto result = instance->qnn_init(nullptr); if (result != 0) { - QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", qnn::get_backend_name(device)); + QNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why", qnn::get_backend_name(device)); return nullptr; } auto qnn_interface = instance->get_qnn_interface(); if (!qnn_interface) { - QNN_LOG_WARN("qnn subsystem failure\n"); + QNN_LOG_WARN("qnn subsystem failure"); return nullptr; } From baff0594607680b3a460afdbf8693856e19e56ca Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 6 Dec 2024 10:26:30 +0800 Subject: [PATCH 18/34] remove link android lib --- ggml/src/ggml-qnn/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index b8d84d078e082..7bbb9be76b4f6 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -2,8 +2,7 @@ message(STATUS "Using QNN backend") if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) - find_library(ANDROID_LIB android) - set(QNN_LINK_LIBRARIES ${LOG_LIB} ${ANDROID_LIB}) + set(QNN_LINK_LIBRARIES ${LOG_LIB}) set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android") From d559a8e1abed989e592327af00fe6b36e3f7bedb Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 6 Dec 2024 18:21:43 +0800 Subject: [PATCH 19/34] set dimension at bind function --- ggml/src/ggml-qnn/tensor.hpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 31fee800606d3..8845075c11232 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -32,9 +32,8 @@ class ggml_qnn_tensor { if (!_tensor_name.empty()) { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } - QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); - QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); _dimensions = dimensions; update_params_from_ggml_tensor(tensor_type, data_type, rank); QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), @@ -69,7 +68,7 @@ class ggml_qnn_tensor { return true; } - Qnn_Tensor_t qnn_tensor = _qnn_tensor; + Qnn_Tensor_t qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { @@ -141,6 +140,7 @@ class ggml_qnn_tensor { const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } + const qnn_dimension_array_t &get_dimension_strides() const { return _dimension_strides; } uint32_t 
get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } private: @@ -161,6 +161,7 @@ class ggml_qnn_tensor { return true; } + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); if (should_use_mem_handle()) { if (!_rpc_buffer) { auto rpc_buffer = std::make_shared( @@ -280,6 +281,7 @@ class ggml_qnn_tensor { std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; + qnn_dimension_array_t _dimension_strides = {}; Qnn_GraphHandle_t _graph_handle = nullptr; qnn_buffer_ptr _rpc_buffer; From 3bc9caa2aaad5ec57660df73f56afd57e855f249 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Fri, 6 Dec 2024 21:50:24 +0800 Subject: [PATCH 20/34] move graph traversal to backend-ops --- ggml/src/ggml-qnn/backend-ops.cpp | 42 ++++++++++++++++++++----------- ggml/src/ggml-qnn/backend-ops.hpp | 4 +-- ggml/src/ggml-qnn/ggml-qnn.cpp | 18 +++---------- 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f4a820af38a8f..68775eedebdb7 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -3,6 +3,8 @@ #include +#include "ggml-impl.h" + #include "graph.hpp" #include "logger.hpp" #include "op-config.hpp" @@ -624,7 +626,7 @@ bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggm namespace qnn { -bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { +bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { // Note that this function could be called before the device context is initialized if (op->op == GGML_OP_NONE) { return true; @@ -683,24 +685,34 @@ bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tenso return true; } -bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor) { - size_t unary_op_idx = tensor->op; - if (tensor->op == GGML_OP_UNARY) { - unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); - } +bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor *tensor = cgraph->nodes[i]; + if (ggml_is_empty(tensor)) { + continue; + } - auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; - if (unary_op) { - return unary_op(ctx, tensor->src[0], tensor); - } + size_t unary_op_idx = tensor->op; + if (tensor->op == GGML_OP_UNARY) { + unary_op_idx = kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } - auto binary_op = kQnnBinaryOpsTable[tensor->op]; - if (binary_op) { - return binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + bool ok = false; + auto unary_op = kQnnUnaryOpsTable[unary_op_idx]; + auto binary_op = kQnnBinaryOpsTable[tensor->op]; + if (unary_op) { + ok = unary_op(ctx, tensor->src[0], tensor); + } else if (binary_op) { + ok = binary_op(ctx, tensor->src[0], tensor->src[1], tensor); + } + + if (!ok) { + QNN_LOG_WARN("[%s]unsupported op %s", qnn::get_backend_name(ctx->device), ggml_op_desc(tensor)); + return false; + } } - QNN_LOG_WARN("[forward]unsupported op %s", ggml_op_desc(tensor)); - return false; + return true; } } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 3df7f4a98a146..c49c4d6dc19d7 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,7 +6,7 @@ namespace qnn { -bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const 
ggml_tensor *op); -bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor); +bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); +bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph); } // namespace qnn diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 60b0624da8c16..c0ec9e174a95a 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -239,20 +239,8 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) } ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, ggml_cgraph *cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - auto *device_ctx = get_device_context(backend->device); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor *node = cgraph->nodes[i]; - if (ggml_is_empty(node)) { - continue; - } - bool ok = qnn::ggml_qnn_forward(device_ctx, node); - if (!ok) { - QNN_LOG_DEBUG("error: op not supported %s (%s)", node->name, ggml_op_name(node->op)); - } - } - - return result; + return qnn::device_compute_graph(get_device_context(backend->device), cgraph) ? GGML_STATUS_SUCCESS + : GGML_STATUS_FAILED; } constexpr const ggml_backend_i ggml_backend_qnn_interface = { @@ -408,7 +396,7 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { // Note that this function could be called before the device context is initialized auto *device_ctx = get_device_context(dev); - return qnn::ggml_qnn_supports_op(device_ctx, op); + return qnn::device_supports_op(device_ctx, op); } bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { From a3c68fcbc3ef7480e7bd066f8a08741015da7a17 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Sat, 7 Dec 2024 15:50:56 +0800 Subject: [PATCH 21/34] wip --- ggml/src/ggml-qnn/backend-ops.cpp | 2 +- ggml/src/ggml-qnn/tensor.hpp | 5 +++-- ggml/src/ggml-qnn/utils.cpp | 14 ++++++++++++++ ggml/src/ggml-qnn/utils.hpp | 2 ++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 68775eedebdb7..ac96eaeb81f45 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -444,7 +444,7 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { static_assert(sizeof(kQnnUnaryOpsTable) / sizeof(kQnnUnaryOpsTable[0]) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "GGML_OP_COUNT does not match the size of the kQnnUnaryOpsTable table"); -static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { +constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP qnn_binary_op_impl, // GGML_OP_ADD diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index 8845075c11232..b1106a019ec24 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -97,6 +97,7 @@ class ggml_qnn_tensor { return false; } + _buffer_dimensions = get_internal_dimension(tensor->nb, tensor->rank); QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), ggml_get_name(tensor)); return true; @@ -140,7 +141,7 @@ class ggml_qnn_tensor { const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } const 
qnn_dimension_array_t &get_dimensions() const { return _dimensions; } - const qnn_dimension_array_t &get_dimension_strides() const { return _dimension_strides; } + const qnn_dimension_array_t &get_buffer_dimensions() const { return _buffer_dimensions; } uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } private: @@ -281,7 +282,7 @@ class ggml_qnn_tensor { std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; - qnn_dimension_array_t _dimension_strides = {}; + qnn_dimension_array_t _buffer_dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; qnn_buffer_ptr _rpc_buffer; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index dbfb3f51fd246..8da80aa780282 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -39,6 +39,20 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, return internal_dims; } +qnn_dimension_array_t get_internal_buffer_dimension(const ggml_stride_array_t &nb, ggml_type type, uint32_t rank) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0); + + const auto block_size = ggml_blck_size(type); + qnn_dimension_array_t internal_dims = {}; + internal_dims[rank - 1] = nb[1] * block_size / nb[0]; + for (uint32_t i = 1; i < rank; i++) { + internal_dims[rank - i - 1] = nb[i] * block_size / mb[0]; + } + + return internal_dims; +} + // TODO: mapping more ggml data type to QNN data type // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) { diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 4a10c277691af..8947ca28a0b1f 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -21,9 +21,11 @@ namespace qnn { using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; +using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; using qnn_dimension_array_t = std::array; qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); +qnn_dimension_array_t get_internal_buffer_dimension(const ggml_stride_array_t &nb, ggml_type type, uint32_t rank); uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); const char *get_ggml_type_name(ggml_type type); From eeb3b6afbb2bf2455b567b0c8146552a822cb899 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 7 Dec 2024 19:13:48 +0800 Subject: [PATCH 22/34] add get_view_internal_dimension to obtain the tensor view source dimension --- ggml/src/ggml-qnn/tensor.hpp | 7 ++++--- ggml/src/ggml-qnn/utils.cpp | 31 +++++++++++++++++++++---------- ggml/src/ggml-qnn/utils.hpp | 2 +- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index b1106a019ec24..de79d905deac9 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -97,7 +97,7 @@ class ggml_qnn_tensor { return false; } - _buffer_dimensions = get_internal_dimension(tensor->nb, tensor->rank); + _view_source_dimensions = get_view_internal_dimension(tensor, _element_offset); QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), ggml_get_name(tensor)); return true; @@ -141,7 +141,7 @@ class ggml_qnn_tensor { const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } Qnn_DataType_t get_data_type() const { return 
QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } - const qnn_dimension_array_t &get_buffer_dimensions() const { return _buffer_dimensions; } + const qnn_dimension_array_t &get_view_source_dimensions() const { return _view_source_dimensions; } uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } private: @@ -282,7 +282,8 @@ class ggml_qnn_tensor { std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; - qnn_dimension_array_t _buffer_dimensions = {}; + qnn_dimension_array_t _view_source_dimensions = {}; + size_t _element_offset = 0; Qnn_GraphHandle_t _graph_handle = nullptr; qnn_buffer_ptr _rpc_buffer; diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 8da80aa780282..46125b808b977 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -39,18 +39,29 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, return internal_dims; } -qnn_dimension_array_t get_internal_buffer_dimension(const ggml_stride_array_t &nb, ggml_type type, uint32_t rank) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); - GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0); - - const auto block_size = ggml_blck_size(type); - qnn_dimension_array_t internal_dims = {}; - internal_dims[rank - 1] = nb[1] * block_size / nb[0]; - for (uint32_t i = 1; i < rank; i++) { - internal_dims[rank - i - 1] = nb[i] * block_size / mb[0]; +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offset_out) { + const auto rank = get_ggml_tensor_rank(tensor); + if (!tensor->view_src) { + element_offset_out = 0; + return get_internal_dimension(tensor->ne, rank); } - return internal_dims; + element_offset_out = 0; + auto *parent_tensor = tensor; + do { + element_offset_out += parent_tensor->view_offs; + parent_tensor = parent_tensor->view_src; + } while (parent_tensor->view_src); + + const auto parent_rank = get_ggml_tensor_rank(parent_tensor); + GGML_ASSERT(parent_tensor->type == tensor->type); + GGML_ASSERT(parent_rank == rank); + + const auto block_size = ggml_blck_size(tensor->type); + element_offset_out = + element_offset_out * block_size / tensor->nb[0]; // calculate the element offset in the view tensor + + return get_internal_dimension(parent_tensor->ne, parent_rank); } // TODO: mapping more ggml data type to QNN data type diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index 8947ca28a0b1f..1ec0af4c96f77 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -25,7 +25,7 @@ using ggml_stride_array_t = size_t[GGML_MAX_DIMS]; using qnn_dimension_array_t = std::array; qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); -qnn_dimension_array_t get_internal_buffer_dimension(const ggml_stride_array_t &nb, ggml_type type, uint32_t rank); +qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offser_out); uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); const char *get_ggml_type_name(ggml_type type); From 8251273c56d1708834f53f0fca3097ab8293f738 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sat, 7 Dec 2024 23:22:30 +0800 Subject: [PATCH 23/34] use _view_source_dimensions to allocate qnn tensor --- ggml/src/ggml-qnn/tensor.hpp | 5 +++-- ggml/src/ggml-qnn/utils.cpp | 11 ++++------- 2 files changed, 7 
insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index de79d905deac9..c78278fc78574 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -35,6 +35,7 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); _dimensions = dimensions; + _view_source_dimensions = dimensions; update_params_from_ggml_tensor(tensor_type, data_type, rank); QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], @@ -92,12 +93,12 @@ class ggml_qnn_tensor { } bool bind_ggml_tensor(ggml_tensor *tensor) { + _view_source_dimensions = get_view_internal_dimension(tensor, _element_offset); if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); return false; } - _view_source_dimensions = get_view_internal_dimension(tensor, _element_offset); QNN_LOG_DEBUG("[%s][%s]bind to ggml tensor(%s)", get_backend_name(_device), _tensor_name.c_str(), ggml_get_name(tensor)); return true; @@ -162,7 +163,7 @@ class ggml_qnn_tensor { return true; } - QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _view_source_dimensions.data()); if (should_use_mem_handle()) { if (!_rpc_buffer) { auto rpc_buffer = std::make_shared( diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index 46125b808b977..eaabe60cdb262 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -40,19 +40,16 @@ qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, } qnn_dimension_array_t get_view_internal_dimension(const ggml_tensor *tensor, size_t &element_offset_out) { - const auto rank = get_ggml_tensor_rank(tensor); - if (!tensor->view_src) { - element_offset_out = 0; - return get_internal_dimension(tensor->ne, rank); - } element_offset_out = 0; + auto *parent_tensor = tensor; - do { + while (parent_tensor->view_src) { element_offset_out += parent_tensor->view_offs; parent_tensor = parent_tensor->view_src; - } while (parent_tensor->view_src); + } + const auto rank = get_ggml_tensor_rank(tensor); const auto parent_rank = get_ggml_tensor_rank(parent_tensor); GGML_ASSERT(parent_tensor->type == tensor->type); GGML_ASSERT(parent_rank == rank); From e9d07c1d0349be9cfa9fc94948cd55ba94971b06 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 8 Dec 2024 17:26:02 +0800 Subject: [PATCH 24/34] add place holder function ggml_backend_qnn_cpy_tensor_async --- ggml/src/ggml-qnn/ggml-qnn.cpp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index c0ec9e174a95a..351cd84d27ac9 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -85,6 +85,11 @@ static_assert(sizeof(kDeviceCaps) / sizeof(kDeviceCaps[0]) == GGML_QNN_MAX_DEVIC "The number of qnn devices should be equal to GGML_QNN_MAX_DEVICES"); static_assert(kDeviceCaps[QNN_BACKEND_NPU].type == GGML_BACKEND_DEVICE_TYPE_ACCEL, "The NPU device should be an accelerator device"); +static_assert(kDeviceCaps[QNN_BACKEND_GPU].type == GGML_BACKEND_DEVICE_TYPE_GPU, + "The NPU device should be an accelerator device"); + +static_assert(kDeviceCaps[QNN_BACKEND_CPU].type == GGML_BACKEND_DEVICE_TYPE_CPU, + "The NPU device 
should be an accelerator device"); ggml_backend_qnn_device_context *get_device_context(ggml_backend_dev_t dev) { return reinterpret_cast(dev->context); @@ -127,8 +132,7 @@ void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml memcpy(data, (const char *)tensor->data + offset, size); } -bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor *src, - struct ggml_tensor *dst) { +bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) { GGML_UNUSED(buffer); if (ggml_backend_buffer_is_host(src->buffer)) { memcpy(dst->data, src->data, ggml_nbytes(src)); @@ -212,6 +216,15 @@ void ggml_backend_qnn_free(ggml_backend_t backend) { } } +bool ggml_backend_qnn_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor *src, + ggml_tensor *dst) { + GGML_UNUSED(backend_src); + GGML_UNUSED(backend_dst); + GGML_UNUSED(src); + GGML_UNUSED(dst); + return false; +} + ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(ggml_backend_dev_t dev) { static ggml_backend_buffer_type ggml_backend_qnn_buffer_types[GGML_QNN_MAX_DEVICES]; auto *dev_ctx = get_device_context(dev); @@ -248,7 +261,7 @@ constexpr const ggml_backend_i ggml_backend_qnn_interface = { /* .free = */ ggml_backend_qnn_free, /* .set_tensor_async = */ nullptr, /* .get_tensor_async = */ nullptr, - /* .cpy_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ ggml_backend_qnn_cpy_tensor_async, /* .synchronize = */ nullptr, /* .graph_plan_create = */ nullptr, /* .graph_plan_free = */ nullptr, @@ -285,7 +298,7 @@ enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t d return kDeviceCaps[get_device_context(dev)->device].type; } -void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props *props) { +void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props *props) { props->name = ggml_backend_qnn_device_get_name(dev); props->description = ggml_backend_qnn_device_get_description(dev); props->type = ggml_backend_qnn_device_get_type(dev); @@ -304,6 +317,8 @@ ggml_guid_t ggml_backend_qnn_guid() { return &guid; } +bool ggml_backend_is_qnn(ggml_backend_t backend) { return ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); } + ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, const char *extend_lib_search_path) { if (!extend_lib_search_path) { extend_lib_search_path = GGML_QNN_DEFAULT_LIB_SEARCH_PATH; @@ -393,7 +408,7 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t return ggml_backend_cpu_buffer_from_ptr(ptr, size); } -bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { +bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor *op) { // Note that this function could be called before the device context is initialized auto *device_ctx = get_device_context(dev); return qnn::device_supports_op(device_ctx, op); @@ -410,7 +425,7 @@ bool ggml_backend_qnn_device_offload_op(ggml_backend_dev_t dev, const ggml_tenso return false; } -const struct ggml_backend_device_i ggml_backend_qnn_device_interface = { +constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .get_name = */ ggml_backend_qnn_device_get_name, /* .get_description = */ ggml_backend_qnn_device_get_description, /* .get_memory = */ ggml_backend_qnn_device_get_memory, From 
4f3f4f0dda2324b33b5fab9e50f1461072a4ecf1 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 8 Dec 2024 18:47:19 +0800 Subject: [PATCH 25/34] add ggml_qnn_aggregate_op_config --- ggml/src/ggml-qnn/op-config.cpp | 8 ++++++ ggml/src/ggml-qnn/op-config.hpp | 48 +++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 7ee5eb38a17a8..145e35dfff667 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -427,6 +427,14 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap return true; } +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +} + bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs) { diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 27571563309a8..388bc90bf69f9 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -82,6 +82,54 @@ class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { DISABLE_MOVE(ggml_qnn_single_op_config); }; +class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { +public: + explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) + : _name(name), _qnn_instance(qnn_instance) {} + + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { + for (auto &op : _operations) { + if (!op->add_op_to_graph(graph_handle)) { + return false; + } + } + return true; + } + + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + + void unbind_input_tensors() override { + for (auto &tensor : _tensor_inputs) { + tensor->unbind(); + } + } + + void unbind_output_tensors() override { + for (auto &tensor : _tensor_outputs) { + tensor->unbind(); + } + } + + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + +protected: + std::string _name; + std::shared_ptr _qnn_instance; + + std::vector _operations; + qnn_tensor_array_t _tensor_inputs; + qnn_tensor_array_t _tensor_outputs; + std::vector _qnn_tensor_inputs; + std::vector _qnn_tensor_outputs; + +private: + DISABLE_COPY(ggml_qnn_aggregate_op_config); + DISABLE_MOVE(ggml_qnn_aggregate_op_config); +}; + class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { public: ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) From 807b22b3a153996ae89a1d7f8a7b337196c53629 Mon Sep 17 00:00:00 2001 From: Hongrui Chen Date: Sun, 8 Dec 2024 21:49:30 +0800 Subject: [PATCH 26/34] make matmul based on ggml_qnn_aggregate_op_config --- ggml/src/ggml-qnn/op-config.cpp | 97 +++++++-------------------------- ggml/src/ggml-qnn/op-config.hpp | 22 +------- 2 files changed, 23 insertions(+), 96 deletions(-) diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 145e35dfff667..4ee3fb25bff98 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ 
-280,6 +280,14 @@ bool ggml_qnn_single_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph return true; } +bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +} + bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -293,20 +301,21 @@ bool ggml_qnn_matmul_op_config::initialize_op_nodes(QNNBackend device, Qnn_Graph create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); // create output tensor - qnn_tensor_array_t mat_mul_tensor_outputs; params.name_prefix = "dst"; params.is_input = false; - create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); // create convert nodes qnn_tensor_array_t mat_mul_tensor_inputs = _tensor_inputs; + qnn_tensor_array_t mat_mul_tensor_outputs = _tensor_outputs; if (!create_convert_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs)) { QNN_LOG_ERROR("create convert nodes failed"); return false; } - mat_mul_tensor_inputs.front() = create_gather_nodes(device, graph_handle, tensor_rank, _tensor_inputs.front(), - _tensor_inputs.back()->get_dimensions()); + mat_mul_tensor_inputs.front() = + create_gather_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs.front(), + mat_mul_tensor_inputs.back()->get_dimensions()); return create_mat_mul_nodes(device, graph_handle, tensor_rank, mat_mul_tensor_inputs, mat_mul_tensor_outputs); } @@ -365,15 +374,15 @@ qnn_tensor_ptr_t ggml_qnn_matmul_op_config::create_gather_nodes(QNNBackend devic qnn_dimension_array_t intermediate_dimensions = input_dimensions; intermediate_dimensions[rank - 3] = output_dimensions[rank - 3]; qnn_tensor_ptr_t gather0_out; - _gather0 = create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, - graph_handle, _qnn_instance, gather0_out); + _operations.push_back(create_node(_name + "_gather0", rank, rank - 3, intermediate_dimensions, tensor_input, device, + graph_handle, _qnn_instance, gather0_out)); if (rank == 3) { return gather0_out; } qnn_tensor_ptr_t gather1_out; - _gather1 = create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, graph_handle, - _qnn_instance, gather1_out); + _operations.push_back(create_node(_name + "_gather1", rank, rank - 4, output_dimensions, gather0_out, device, + graph_handle, _qnn_instance, gather1_out)); return gather1_out; } @@ -389,7 +398,6 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap auto tensor_type = get_tensor_type(tensor_inputs); QNN_LOG_DEBUG("input tensor type: %s", qnn_datatype_to_string(tensor_type)); - _input_converts.resize(tensor_inputs.size()); for (size_t i = 0; i < tensor_inputs.size(); ++i) { // create input convert nodes auto convert_in = tensor_inputs[i]; @@ -406,7 +414,7 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap convert->set_input_tensors({convert_in}); convert->set_output_tensors({convert_out}); tensor_inputs[i] = convert_out; - _input_converts[i] = 
convert; + _operations.push_back(convert); } if (tensor_outputs.front()->get_data_type() != tensor_type) { @@ -421,20 +429,12 @@ bool ggml_qnn_matmul_op_config::create_convert_nodes(QNNBackend device, Qnn_Grap output_convert->set_input_tensors({convert_in}); output_convert->set_output_tensors({convert_out}); tensor_outputs.front() = convert_in; - _output_convert = output_convert; + _operations.push_back(output_convert); } return true; } -bool ggml_qnn_aggregate_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); -} - -bool ggml_qnn_aggregate_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { - return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); -} - bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs) { @@ -523,66 +523,11 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap transpose_out->set_input_tensors(tensors); transpose_out->set_output_tensors(tensor_outputs); - _mat_mul = mat_mul; - _transpose_out = transpose_out; + _operations.push_back(mat_mul); + _operations.push_back(transpose_out); return true; } -bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { - for (auto &convert : _input_converts) { - if (convert && !convert->add_op_to_graph(graph_handle)) { - return false; - } - } - - if (_gather0 && !_gather0->add_op_to_graph(graph_handle)) { - return false; - } - - if (_gather1 && !_gather1->add_op_to_graph(graph_handle)) { - return false; - } - - return _mat_mul->add_op_to_graph(graph_handle) && _transpose_out->add_op_to_graph(graph_handle) && - (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); -} - -bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { - return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); -} - -bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { - if (_output_convert) { - return _output_convert->bind_output_tensors(tensor_outputs); - } else { - return _transpose_out->bind_output_tensors(tensor_outputs); - } -} - -void ggml_qnn_matmul_op_config::unbind_input_tensors() { - _mat_mul->unbind_input_tensors(); - for (auto &convert : _input_converts) { - if (convert) { - convert->unbind_input_tensors(); - } - } -} - -void ggml_qnn_matmul_op_config::unbind_output_tensors() { - _transpose_out->unbind_output_tensors(); - if (_output_convert) { - _output_convert->unbind_output_tensors(); - } -} - -std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { - if (_output_convert) { - return _output_convert->get_qnn_output_tensors(); - } else { - return _transpose_out->get_qnn_output_tensors(); - } -} - ggml_op_constructor_t create_op_constructor(const std::string &op_name) { if (op_name == QNN_OP_MAT_MUL) { // For QNN_OP_MAT_MUL, we need to transpose the input tensor diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 388bc90bf69f9..1b3bd0654657f 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -130,21 +130,14 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { DISABLE_MOVE(ggml_qnn_aggregate_op_config); }; -class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { +class ggml_qnn_matmul_op_config : public 
ggml_qnn_aggregate_op_config { public: ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) - : _name(name), _qnn_instance(qnn_instance) {} + : ggml_qnn_aggregate_op_config(name, qnn_instance) {} bool initialize_op_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) override; - bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; - bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; - bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; - void unbind_input_tensors() override; - void unbind_output_tensors() override; - std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() override; private: qnn_tensor_ptr_t create_gather_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, @@ -154,17 +147,6 @@ class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, qnn_tensor_array_t &tensor_inputs, qnn_tensor_array_t &tensor_outputs); - std::string _name; - std::shared_ptr _qnn_instance; - qnn_op_config_ptr_t _transpose_out; - qnn_op_config_ptr_t _mat_mul; - qnn_op_config_ptr_t _gather0; - qnn_op_config_ptr_t _gather1; - std::vector _input_converts; - qnn_op_config_ptr_t _output_convert; - qnn_tensor_array_t _tensor_inputs; - std::vector _qnn_tensor_inputs; - DISABLE_COPY(ggml_qnn_matmul_op_config); DISABLE_MOVE(ggml_qnn_matmul_op_config); }; From c81bba9f2128332bfa4ff8ed7b6c5f1893ee4a43 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Mon, 9 Dec 2024 21:01:42 +0800 Subject: [PATCH 27/34] wip --- ggml/src/ggml-qnn/op-config.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp index 4ee3fb25bff98..b3c84b5435095 100644 --- a/ggml/src/ggml-qnn/op-config.cpp +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -440,7 +440,7 @@ bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_Grap qnn_tensor_array_t &tensor_outputs) { /* - * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please also: + * First, both the ggml and qnn tensor in memory are stored as row-major format. (For more details, please refer to: * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) * But the dimensions of the tensor are stored in different order. 
* For example, a 2x3 matrix: @@ -536,13 +536,6 @@ ggml_op_constructor_t create_op_constructor(const std::string &op_name) { QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s", instance_name.c_str()); return std::make_unique(instance_name, qnn_instance); }; - } else if (op_name == QNN_OP_TRANSPOSE) { - return [](const std::string &instance_name, - std::shared_ptr qnn_instance) -> std::unique_ptr { - return std::make_unique( - instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, QNN_OP_TRANSPOSE_PARAM_PERM, - QNN_DATATYPE_UINT_32, 4 * sizeof(uint32_t), qnn_instance); - }; } return [op_name](const std::string &instance_name, From a662a84985fe18acad4f3608db7f7f3113af0b48 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 10 Dec 2024 19:08:52 +0800 Subject: [PATCH 28/34] manually specify the order of op destruct --- ggml/src/ggml-qnn/op-config.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 1b3bd0654657f..a05b75ade7e6a 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -87,6 +87,14 @@ class ggml_qnn_aggregate_op_config : public ggml_qnn_op_config { explicit ggml_qnn_aggregate_op_config(const std::string &name, std::shared_ptr qnn_instance) : _name(name), _qnn_instance(qnn_instance) {} + ~ggml_qnn_aggregate_op_config() { + _qnn_tensor_inputs.clear(); + _qnn_tensor_outputs.clear(); + _tensor_inputs.clear(); + _tensor_outputs.clear(); + _operations.clear(); + } + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override { for (auto &op : _operations) { if (!op->add_op_to_graph(graph_handle)) { From 2d6813c643eed38829d2285b81f4d859875674c6 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 10 Dec 2024 22:28:47 +0800 Subject: [PATCH 29/34] skip register qnn-cpu backend --- ggml/src/ggml-qnn/ggml-qnn.cpp | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 351cd84d27ac9..c7c5f45e5b521 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -450,27 +450,32 @@ constexpr const ggml_backend_device_i ggml_backend_qnn_device_interface = { */ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { - std::array, GGML_QNN_MAX_DEVICES> device_contexts; - std::array devices; + std::vector> device_contexts; + std::vector devices; explicit ggml_backend_qnn_reg_impl(ggml_backend_reg_i interface) { context = this; iface = interface; QNN_LOG_DEBUG("qnn backend registry init"); - for (int i = 0; i < GGML_QNN_MAX_DEVICES; i++) { - const auto device_enum = (QNNBackend)(GGML_QNN_MAX_DEVICES - 1 - i); // init from the last device, i.e. NPU - device_contexts[i] = std::make_unique( + for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { + const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU + if (device_enum == QNN_BACKEND_CPU) { + continue; + } + + device_contexts.emplace_back(std::make_unique( /* .device = */ device_enum, // init from the last device, i.e. 
NPU /* .threads = */ 1, /* .name = */ qnn::get_backend_name(device_enum), /* .lib_name = */ kDeviceCaps[device_enum].lib_name, - /* .supported_types = */ kDeviceCaps[device_enum].supported_types); + /* .supported_types = */ kDeviceCaps[device_enum].supported_types)); - auto &device = devices[i]; - device.iface = ggml_backend_qnn_device_interface; - device.reg = this; - device.context = device_contexts[i].get(); + devices.emplace_back(ggml_backend_device{ + /* iface = */ ggml_backend_qnn_device_interface, + /* reg = */ this, + /* context = */ device_contexts.back().get(), + }); } } }; From 7fffb37ff1cc1c6ae76f86a7aea0766cadcf1088 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 10 Dec 2024 22:33:26 +0800 Subject: [PATCH 30/34] disable view op again --- ggml/src/ggml-qnn/backend-ops.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index ac96eaeb81f45..bfa31ede053d2 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -674,9 +674,6 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor case GGML_OP_MUL_MAT: return ggml_qnn_supports_matmul_op(ctx, op); - case GGML_OP_VIEW: - return true; - default: return false; } From 270facaf8e25728ae213ce1d97450c8cedbd19f5 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 10 Dec 2024 23:24:54 +0800 Subject: [PATCH 31/34] remove _view_source_dimensions --- ggml/src/ggml-qnn/tensor.hpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index c78278fc78574..833c620971e0d 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -33,9 +33,9 @@ class ggml_qnn_tensor { QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); } - QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); _dimensions = dimensions; - _view_source_dimensions = dimensions; + QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); + QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); update_params_from_ggml_tensor(tensor_type, data_type, rank); QNN_LOG_DEBUG("[%s][%s]created, rank: %d, dims: [%d, %d, %d, %d], type: %s", get_backend_name(device), _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], @@ -69,7 +69,7 @@ class ggml_qnn_tensor { return true; } - Qnn_Tensor_t qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); + Qnn_Tensor_t qnn_tensor = _qnn_tensor; auto qnn_interface = _qnn_instance->get_qnn_interface(); auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); if (error != QNN_SUCCESS) { @@ -93,7 +93,6 @@ class ggml_qnn_tensor { } bool bind_ggml_tensor(ggml_tensor *tensor) { - _view_source_dimensions = get_view_internal_dimension(tensor, _element_offset); if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { QNN_LOG_WARN("[%s]failed to bind ggml tensor(%s)", _tensor_name.c_str(), ggml_get_name(tensor)); return false; @@ -142,7 +141,6 @@ class ggml_qnn_tensor { const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } - const qnn_dimension_array_t &get_view_source_dimensions() const { return _view_source_dimensions; } uint32_t get_qnn_tensor_id() const { return QNN_TENSOR_GET_ID(_qnn_tensor); } private: @@ -163,7 
+161,6 @@ class ggml_qnn_tensor { return true; } - QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _view_source_dimensions.data()); if (should_use_mem_handle()) { if (!_rpc_buffer) { auto rpc_buffer = std::make_shared( @@ -283,8 +280,6 @@ class ggml_qnn_tensor { std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); qnn_dimension_array_t _dimensions = {}; - qnn_dimension_array_t _view_source_dimensions = {}; - size_t _element_offset = 0; Qnn_GraphHandle_t _graph_handle = nullptr; qnn_buffer_ptr _rpc_buffer; From 260e1196e22f7da2bee31cca553c600e3946baff Mon Sep 17 00:00:00 2001 From: hongruichen Date: Tue, 10 Dec 2024 23:40:16 +0800 Subject: [PATCH 32/34] add nop for reshape and view ops --- ggml/src/ggml-qnn/backend-ops.cpp | 62 +++++++++++++++---------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index bfa31ede053d2..f9ff39ddd97c1 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -365,37 +365,37 @@ constexpr const ggml_qnn_unary_op_t kQnnUnaryOpsTable[] = { nullptr, // GGML_OP_MUL_MAT_ID nullptr, // GGML_OP_OUT_PROD - nullptr, // GGML_OP_SCALE - nullptr, // GGML_OP_SET - nullptr, // GGML_OP_CPY - nullptr, // GGML_OP_CONT - nullptr, // GGML_OP_RESHAPE - qnn_unary_nop_impl, // GGML_OP_VIEW - qnn_unary_op_impl, // GGML_OP_PERMUTE - nullptr, // GGML_OP_TRANSPOSE - qnn_unary_nop_impl, // GGML_OP_GET_ROWS - nullptr, // GGML_OP_GET_ROWS_BACK - nullptr, // GGML_OP_DIAG - nullptr, // GGML_OP_DIAG_MASK_INF - nullptr, // GGML_OP_DIAG_MASK_ZERO - nullptr, // GGML_OP_SOFT_MAX - nullptr, // GGML_OP_SOFT_MAX_BACK - nullptr, // GGML_OP_ROPE - nullptr, // GGML_OP_ROPE_BACK - nullptr, // GGML_OP_CLAMP - nullptr, // GGML_OP_CONV_TRANSPOSE_1D - nullptr, // GGML_OP_IM2COL - nullptr, // GGML_OP_IM2COL_BACK - nullptr, // GGML_OP_CONV_TRANSPOSE_2D - nullptr, // GGML_OP_POOL_1D - nullptr, // GGML_OP_POOL_2D - nullptr, // GGML_OP_POOL_2D_BACK - nullptr, // GGML_OP_UPSCALE - nullptr, // GGML_OP_PAD - nullptr, // GGML_OP_ARANGE - nullptr, // GGML_OP_TIMESTEP_EMBEDDING - nullptr, // GGML_OP_ARGSORT - nullptr, // GGML_OP_LEAKY_RELU + nullptr, // GGML_OP_SCALE + nullptr, // GGML_OP_SET + nullptr, // GGML_OP_CPY + nullptr, // GGML_OP_CONT + qnn_unary_nop_impl, // GGML_OP_RESHAPE + qnn_unary_nop_impl, // GGML_OP_VIEW + qnn_unary_nop_impl, // GGML_OP_PERMUTE + qnn_unary_nop_impl, // GGML_OP_TRANSPOSE + qnn_unary_nop_impl, // GGML_OP_GET_ROWS + nullptr, // GGML_OP_GET_ROWS_BACK + nullptr, // GGML_OP_DIAG + nullptr, // GGML_OP_DIAG_MASK_INF + nullptr, // GGML_OP_DIAG_MASK_ZERO + nullptr, // GGML_OP_SOFT_MAX + nullptr, // GGML_OP_SOFT_MAX_BACK + nullptr, // GGML_OP_ROPE + nullptr, // GGML_OP_ROPE_BACK + nullptr, // GGML_OP_CLAMP + nullptr, // GGML_OP_CONV_TRANSPOSE_1D + nullptr, // GGML_OP_IM2COL + nullptr, // GGML_OP_IM2COL_BACK + nullptr, // GGML_OP_CONV_TRANSPOSE_2D + nullptr, // GGML_OP_POOL_1D + nullptr, // GGML_OP_POOL_2D + nullptr, // GGML_OP_POOL_2D_BACK + nullptr, // GGML_OP_UPSCALE + nullptr, // GGML_OP_PAD + nullptr, // GGML_OP_ARANGE + nullptr, // GGML_OP_TIMESTEP_EMBEDDING + nullptr, // GGML_OP_ARGSORT + nullptr, // GGML_OP_LEAKY_RELU nullptr, // GGML_OP_FLASH_ATTN_EXT nullptr, // GGML_OP_FLASH_ATTN_BACK From 1a6f249f5db038be1af0526767f5c753c3dfeba7 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 11 Dec 2024 00:13:56 +0800 Subject: [PATCH 33/34] add log --- ggml/src/ggml-qnn/backend-ops.cpp | 1 + 1 file changed, 1 insertion(+) diff --git 
a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index f9ff39ddd97c1..990338c953524 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -683,6 +683,7 @@ bool device_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor } bool device_compute_graph(ggml_backend_qnn_device_context *ctx, ggml_cgraph *cgraph) { + QNN_LOG_DEBUG("[%s]compute graph, nodes count: %d", qnn::get_backend_name(ctx->device), (int)cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor *tensor = cgraph->nodes[i]; if (ggml_is_empty(tensor)) { continue; } From 925d51e2879adb9cf00486bb113a6d9c36bf8921 Mon Sep 17 00:00:00 2001 From: hongruichen Date: Wed, 11 Dec 2024 10:40:57 +0800 Subject: [PATCH 34/34] add comment --- ggml/src/ggml-qnn/ggml-qnn.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index c7c5f45e5b521..933016a62878e 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -461,6 +461,10 @@ struct ggml_backend_qnn_reg_impl : ggml_backend_reg { for (size_t i = 0; i < QNN_BACKEND_COUNT; i++) { const auto device_enum = (QNNBackend)(QNN_BACKEND_COUNT - 1 - i); // init from the last device, i.e. NPU if (device_enum == QNN_BACKEND_CPU) { + /* + * skip the CPU device initialization here, + * otherwise unsupported ops could not fall back to the ggml CPU backend + */ continue; }
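The nop entries added for GGML_OP_RESHAPE, GGML_OP_VIEW, GGML_OP_PERMUTE and GGML_OP_TRANSPOSE in patch 32 work because those ops only rewrite tensor metadata: routing them to qnn_unary_nop_impl lets device_compute_graph treat them as already done without emitting a QNN node, while ops whose table entry stays nullptr are reported as unsupported by device_supports_op and scheduled on another backend, which is the fallback that skipping the QNN-CPU device registration preserves. Below is a minimal standalone sketch of this table-driven dispatch, using simplified stand-in types (toy_op, toy_tensor) rather than the real ggml/QNN structures.

```cpp
#include <array>
#include <cstdio>

// Simplified stand-ins for the real ggml op enum, tensor and handler signature.
enum toy_op { TOY_OP_NONE, TOY_OP_ADD, TOY_OP_RESHAPE, TOY_OP_VIEW, TOY_OP_SOFT_MAX, TOY_OP_COUNT };

struct toy_tensor {
    toy_op op;
    const char *name;
};

using toy_handler_t = bool (*)(const toy_tensor &);

// Metadata-only ops succeed without doing any work.
static bool nop_impl(const toy_tensor &) { return true; }

// A real handler would build and execute a QNN graph node here.
static bool add_impl(const toy_tensor &t) {
    std::printf("computing %s\n", t.name);
    return true;
}

// Flat table indexed by op; nullptr marks ops this backend does not handle.
constexpr std::array<toy_handler_t, TOY_OP_COUNT> kOpTable = {
    nop_impl, // TOY_OP_NONE
    add_impl, // TOY_OP_ADD
    nop_impl, // TOY_OP_RESHAPE: only changes metadata
    nop_impl, // TOY_OP_VIEW:    only changes metadata
    nullptr,  // TOY_OP_SOFT_MAX: unsupported, left to another backend
};

static bool compute_node(const toy_tensor &t) {
    auto handler = kOpTable[t.op];
    return handler ? handler(t) : false;
}

int main() {
    const toy_tensor nodes[] = {{TOY_OP_VIEW, "view"}, {TOY_OP_ADD, "add"}};
    for (const auto &node : nodes) {
        if (!compute_node(node)) {
            return 1;
        }
    }
    return 0;
}
```

Keeping the dispatch in a flat array indexed by the op enum keeps the per-node cost to a single table load and makes the supported/unsupported split visible in one place, which is also what the static_assert on the table size in backend-ops.cpp checks against GGML_OP_COUNT.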