diff --git a/ggml/src/ggml-qnn.cpp b/ggml/src/ggml-qnn.cpp index bc88ba0f4de45..21a7dee1c99ef 100644 --- a/ggml/src/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn.cpp @@ -51,12 +51,30 @@ struct qnn_device_caps { const char *description; const char *lib_name; enum ggml_backend_dev_type type; + + // TODO: should get these caps from the device + std::unordered_set<ggml_type> supported_types; }; const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{ - { "qnn-cpu", "Qualcomm Kryo CPU", "libQnnCpu.so", GGML_BACKEND_DEVICE_TYPE_CPU }, /* QNN_BACKEND_CPU */ - { "qnn-gpu", "Qualcomm Adreno GPU", "libQnnGpu.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_GPU */ - { "qnn-npu", "Qualcomm NPU", "libQnnHtp.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_NPU */ + { "qnn-cpu", + "Qualcomm Kryo CPU", + "libQnnCpu.so", + GGML_BACKEND_DEVICE_TYPE_CPU, + { GGML_TYPE_F32, + GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul + { "qnn-gpu", + "Qualcomm Adreno GPU", + "libQnnGpu.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + { GGML_TYPE_F32, + GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul + { "qnn-npu", + "Qualcomm NPU", + "libQnnHtp.so", + GGML_BACKEND_DEVICE_TYPE_GPU, + { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16, + GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul }; class ggml_backend_qnn_buffer_context { @@ -340,9 +358,10 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe props->type = ggml_backend_qnn_device_get_type(dev); ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { - /* async */ false, - /* host_buffer */ false, - /* events */ false, + /* async */ false, + /* host_buffer */ false, + /* buffer_from_host_ptr */ false, + /* events */ false, }; } @@ -412,6 +431,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev, dev_ctx->instance = instance; dev_ctx->qnn_interface = qnn_interface; dev_ctx->socinfo = instance->get_soc_info(); + dev_ctx->supported_types = kDeviceCaps[device_index].supported_types; ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), @@ -440,8 +460,8 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t } bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) { - GGML_UNUSED(dev); - return qnn::ggml_qnn_supports_op(op); + auto *device_ctx = get_device_context(dev); + return qnn::ggml_qnn_supports_op(device_ctx, op); } bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/ggml/src/ggml-qnn/backend-ops.cpp b/ggml/src/ggml-qnn/backend-ops.cpp index 9c6e5709c8189..d20069874a7c3 100644 --- a/ggml/src/ggml-qnn/backend-ops.cpp +++ b/ggml/src/ggml-qnn/backend-ops.cpp @@ -108,8 +108,8 @@ std::string get_graph_key(const std::string &op_name, const std::array &outputs) { constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) { char buffer[256] = {}; - snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3]); + snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type)); key += buffer; }; @@ -117,32 
+117,11 @@ std::string get_graph_key(const std::string &op_name, const std::arraytype); return graph_key; } -qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) { - if (op_name == QNN_OP_MAT_MUL) { - // For QNN_OP_MAT_MUL, we need to transpose the input tensor - return [](const std::string &name) { - auto config = std::make_unique<ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL); - Qnn_Scalar_t scalar = QNN_SCALAR_INIT; - scalar.dataType = QNN_DATATYPE_BOOL_8; - scalar.bool8Value = true; - config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar); - QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0); - return config; - }; - } - - return [op_name](const std::string &name) { - return std::make_unique<ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name); - }; -} - constexpr const char *kGgmlOpToQnnOp[] = { nullptr, // GGML_OP_NONE nullptr, // GGML_OP_DUP @@ -278,7 +257,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c return nullptr; } - auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]); + auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]); if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs), to_ggml_tensor_array<_OutputSize>(outputs))) { QNN_LOG_ERROR("build_graph failed\n"); @@ -542,11 +521,57 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = { static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT, "GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table"); +bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) { + switch (tensor->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) { + QNN_LOG_DEBUG("unsupported data type %d for the current backend", tensor->type); + return false; + } + break; + default: + QNN_LOG_DEBUG("unsupported data type %d", tensor->type); + return false; + } + + return true; +} + +bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + GGML_UNUSED(ctx); + + auto *src0 = op->src[0]; + auto *src1 = op->src[1]; + if (src0->type != src1->type || src0->type != op->type) { + // the current qnn implementation requires src0, src1 and dst to have the same type + QNN_LOG_DEBUG("src0 type %d, src1 type %d and op type %d are not equal", src0->type, src1->type, op->type); + return false; + } + + if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) { + /* + * TODO: remove the blocker here when qnn backend supports mul_mat like this: + * [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n] + */ + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } + + return true; +} + +} // namespace + namespace qnn { -bool ggml_qnn_supports_op(const ggml_tensor *op) { +bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) { + if (op->op == GGML_OP_NONE) { + return true; + } + if (op->op == GGML_OP_UNARY) { if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) { QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op)); @@ -557,35 +582,38 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) { QNN_LOG_DEBUG("src0 is nullptr"); return false; } - } else if (op->op != GGML_OP_NONE) { + } else { if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
QNN_LOG_DEBUG("unsupported op %d", op->op); return false; } - if (!op->src[0] || !op->src[1]) { + auto *src0 = op->src[0]; + auto *src1 = op->src[1]; + if (!src0 || !src1) { QNN_LOG_DEBUG("src0 or src1 is nullptr"); return false; } -#ifndef NDEBUG - if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) { - QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) || + !ggml_qnn_supports_tensor(ctx, op)) { return false; } -#endif - } - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - break; - default: - QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type); - return false; + switch (op->op) { + case GGML_OP_ADD: + if (!is_tensor_dimensions_equal(src0, src1)) { + QNN_LOG_DEBUG("src0 and src1 dimensions are not equal"); + return false; + } + break; + + case GGML_OP_MUL_MAT: + return ggml_qnn_supports_matmul_op(ctx, op); + + default: + return false; + } } return true; diff --git a/ggml/src/ggml-qnn/backend-ops.hpp b/ggml/src/ggml-qnn/backend-ops.hpp index 86658da118f8b..3df7f4a98a146 100644 --- a/ggml/src/ggml-qnn/backend-ops.hpp +++ b/ggml/src/ggml-qnn/backend-ops.hpp @@ -6,7 +6,7 @@ namespace qnn { -bool ggml_qnn_supports_op(const ggml_tensor *op); +bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op); bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor); } // namespace qnn diff --git a/ggml/src/ggml-qnn/backend.hpp b/ggml/src/ggml-qnn/backend.hpp index 696a883480e9f..eb292e89bfd21 100644 --- a/ggml/src/ggml-qnn/backend.hpp +++ b/ggml/src/ggml-qnn/backend.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "ggml.h" @@ -26,6 +27,7 @@ struct ggml_backend_qnn_device_context { // initialize in init qnn::qcom_socinfo socinfo = {}; + std::unordered_set supported_types; std::shared_ptr instance; std::shared_ptr qnn_interface; diff --git a/ggml/src/ggml-qnn/buffer.hpp b/ggml/src/ggml-qnn/buffer.hpp index 4b4b2daaa75b4..676e88c0454be 100644 --- a/ggml/src/ggml-qnn/buffer.hpp +++ b/ggml/src/ggml-qnn/buffer.hpp @@ -8,8 +8,8 @@ namespace qnn { class ggml_qnn_rpc_buffer { public: - ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions, - Qnn_DataType_t data_type) : + ggml_qnn_rpc_buffer(std::shared_ptr qnn_instance, const size_t size, const uint32_t rank, + uint32_t *dimensions, Qnn_DataType_t data_type) : _qnn_instance(qnn_instance), _size(size) { _qnn_rpc_buffer = static_cast(qnn_instance->alloc_rpcmem(size, alignof(void *))); diff --git a/ggml/src/ggml-qnn/graph.hpp b/ggml/src/ggml-qnn/graph.hpp index 3f1a0ef163208..858a7d3af29a2 100644 --- a/ggml/src/ggml-qnn/graph.hpp +++ b/ggml/src/ggml-qnn/graph.hpp @@ -2,7 +2,6 @@ #pragma once #include -#include #include #include #include @@ -12,19 +11,15 @@ #include "logger.hpp" #include "op-config.hpp" #include "qnn-lib.hpp" -#include "tensor.hpp" namespace qnn { -using ggml_tensor_array_t = std::vector; -using ggml_op_constructor_t = std::function(const std::string &)>; - class ggml_qnn_graph { public: explicit ggml_qnn_graph(const std::string &graph_name, QNNBackend device, std::shared_ptr qnn_instance, size_t vtcm_size_in_mb) : _graph_name(graph_name), _device(device), _qnn_instance(qnn_instance) { - QNN_LOG_INFO("graph name %s", graph_name.c_str()); + QNN_LOG_INFO("[%s]create", graph_name.c_str()); auto qnn_interface = 
qnn_instance->get_qnn_interface(); auto qnn_context = qnn_instance->get_qnn_context_handle(); @@ -69,19 +64,16 @@ class ggml_qnn_graph { } if (error != QNN_SUCCESS) { - QNN_LOG_INFO( - "can't create qnn graph handle with graph name %s, " - "error = %d\n", - graph_name.c_str(), error); + QNN_LOG_INFO("[%s]can't create qnn graph handle, error = %d\n", graph_name.c_str(), error); return; } - QNN_LOG_INFO("create qnn graph handle with graph name %s ok\n", graph_name.c_str()); + QNN_LOG_INFO("[%s]create succeed\n", graph_name.c_str()); _graph_handle = graph_handle; _qnn_interface = qnn_interface; } - ~ggml_qnn_graph() { QNN_LOG_DEBUG("graph name %s, destroy", _graph_name.c_str()); } + ~ggml_qnn_graph() { QNN_LOG_DEBUG("[%s]destroy", _graph_name.c_str()); } bool build_graph(ggml_op_constructor_t op_constructor, const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { @@ -91,95 +83,44 @@ class ggml_qnn_graph { return false; } - // get the max tensor rank - for (auto tensor : tensor_inputs) { - _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); - } - for (auto tensor : tensor_outputs) { - _tensor_rank = std::max(_tensor_rank, ggml_n_dims(tensor)); - } - - QNN_LOG_DEBUG("graph name %s, build_graph start", _graph_name.c_str()); - _tensor_inputs.resize(tensor_inputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, GGML_MAX_NAME, "src%d", (int)i); - auto qnn_tensor = - std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); - auto *ggml_tensor = tensor_inputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } - - _tensor_inputs[i] = qnn_tensor; - } - - _tensor_outputs.resize(tensor_outputs.size()); - for (size_t i = 0; i < tensor_outputs.size(); i++) { - char buffer[GGML_MAX_NAME] = {}; - snprintf(buffer, GGML_MAX_NAME, "dst%d", (int)i); - auto qnn_tensor = - std::make_shared(std::string(buffer), _device, _graph_handle, _qnn_instance); - auto *ggml_tensor = tensor_outputs[i]; - if (!qnn_tensor->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } - - _tensor_outputs[i] = qnn_tensor; + QNN_LOG_DEBUG("[%s]build_graph start", _graph_name.c_str()); + _op_config = op_constructor(_graph_name, _qnn_instance); + if (!_op_config->create_tensors(_device, _graph_handle, tensor_inputs, tensor_outputs)) { + QNN_LOG_ERROR("[%s]create_tensors failed\n", _graph_name.c_str()); + return false; } - _op_config = op_constructor(_graph_name); - _op_config->set_input_tensors(_tensor_inputs); - _op_config->set_output_tensors(_tensor_outputs); - auto error = _qnn_interface->qnn_graph_add_node(_graph_handle, _op_config->get_op_config()); - if (error != QNN_SUCCESS) { - auto *error_str = get_qnn_error_string(error); - if (error_str) { - QNN_LOG_ERROR("qnn_graph_add_node.error: %s\n", error_str); - } else { - QNN_LOG_ERROR("qnn_graph_add_node.error: %d\n", error); - } + if (!_op_config->add_op_to_graph(_graph_handle)) { + QNN_LOG_ERROR("[%s]add nodes failed\n", _graph_name.c_str()); return false; } - error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); + auto error = _qnn_interface->qnn_graph_finalize(_graph_handle, nullptr, nullptr); if (error != QNN_SUCCESS) { auto *error_str = get_qnn_error_string(error); if (error_str) { - QNN_LOG_ERROR("qnn_graph_finalize.error: 
%s\n", error_str); + QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %s\n", _graph_name.c_str(), error_str); } else { - QNN_LOG_ERROR("qnn_graph_finalize.error: %d\n", error); + QNN_LOG_ERROR("[%s]qnn_graph_finalize.error: %d\n", _graph_name.c_str(), error); } return false; } - QNN_LOG_DEBUG("graph name %s, build_graph succeed", _graph_name.c_str()); + QNN_LOG_DEBUG("[%s]build_graph succeed", _graph_name.c_str()); return true; } bool execute(const ggml_tensor_array_t &tensor_inputs, const ggml_tensor_array_t &tensor_outputs) { - GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); - GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - auto *ggml_tensor = tensor_inputs[i]; - if (!_tensor_inputs[i]->bind_ggml_tensor(ggml_tensor, true, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } + if (!_op_config->bind_input_tensors(tensor_inputs)) { + QNN_LOG_ERROR("[%s]bind input tensors failed\n", _graph_name.c_str()); + return false; } - for (size_t i = 0; i < tensor_outputs.size(); i++) { - auto *ggml_tensor = tensor_outputs[i]; - if (!_tensor_outputs[i]->bind_ggml_tensor(ggml_tensor, false, _tensor_rank)) { - QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); - return false; - } + if (!_op_config->bind_output_tensors(tensor_outputs)) { + QNN_LOG_ERROR("[%s]bind output tensors failed\n", _graph_name.c_str()); + return false; } - _op_config->set_input_tensors(_tensor_inputs); - _op_config->set_output_tensors(_tensor_outputs); auto &qnn_tensor_inputs = _op_config->get_qnn_input_tensors(); auto &qnn_tensor_outputs = _op_config->get_qnn_output_tensors(); @@ -188,20 +129,15 @@ class ggml_qnn_graph { qnn_tensor_outputs.data(), qnn_tensor_outputs.size(), nullptr, nullptr); if (_device == QNN_BACKEND_NPU) { if (error == QNN_COMMON_ERROR_SYSTEM_COMMUNICATION) { - QNN_LOG_WARN("NPU crashed. SSR detected. Caused QNN graph execute error\n"); + QNN_LOG_WARN("[%s]NPU crashed. SSR detected. 
Caused QNN graph execute error\n", _graph_name.c_str()); } } - for (auto tensor : _tensor_inputs) { - tensor->unbind_ggml_tensor(); - } - - for (auto tensor : _tensor_outputs) { - tensor->unbind_ggml_tensor(); - } + _op_config->unbind_input_tensors(); + _op_config->unbind_output_tensors(); if (error != QNN_SUCCESS) { - QNN_LOG_INFO("error = %d\n", error); + QNN_LOG_INFO("[%s]error = %d\n", _graph_name.c_str(), error); return false; } @@ -220,11 +156,8 @@ class ggml_qnn_graph { Qnn_GraphHandle_t _graph_handle = nullptr; std::shared_ptr _qnn_instance; std::shared_ptr _qnn_interface; - std::vector> _tensor_inputs; - std::vector> _tensor_outputs; std::unique_ptr _op_config; std::vector _param_types; - int _tensor_rank = 0; DISABLE_COPY(ggml_qnn_graph); DISABLE_MOVE(ggml_qnn_graph); diff --git a/ggml/src/ggml-qnn/op-config.cpp b/ggml/src/ggml-qnn/op-config.cpp new file mode 100644 index 0000000000000..07dcba156471b --- /dev/null +++ b/ggml/src/ggml-qnn/op-config.cpp @@ -0,0 +1,471 @@ +#include "op-config.hpp" + +#include + +#include "logger.hpp" + +namespace { + +constexpr const qnn::qnn_dimension_array_t kTransposeParamData[GGML_MAX_DIMS] = { + { 0 }, + { 1, 0 }, + { 0, 2, 1 }, + { 0, 1, 3, 2 }, +}; + +qnn::qnn_dimension_array_t get_transposed_dimensions(const qnn::qnn_dimension_array_t &dimensions, int rank) { + qnn::qnn_dimension_array_t transposed_dims = dimensions; + if (rank >= 2) { + transposed_dims[rank - 1] = dimensions[rank - 2]; + transposed_dims[rank - 2] = dimensions[rank - 1]; + } + + return transposed_dims; +} + +int get_rank(const qnn::ggml_tensor_array_t &tensor_inputs, const qnn::ggml_tensor_array_t &tensor_outputs) { + int tensor_rank = 0; + // get the max tensor rank + for (auto tensor : tensor_inputs) { + tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); + } + for (auto tensor : tensor_outputs) { + tensor_rank = std::max(tensor_rank, ggml_n_dims(tensor)); + } + + return tensor_rank; +} + +Qnn_DataType_t get_tensor_type(const qnn::ggml_qnn_tensor_array_t &tensors) { + Qnn_DataType_t type = QNN_DATATYPE_UNDEFINED; + for (auto tensor : tensors) { + auto tensor_type_size = qnn::qnn_datatype_size(tensor->get_data_type()); + GGML_ASSERT(tensor_type_size > 0); + if (tensor_type_size > qnn::qnn_datatype_size(type)) { + type = tensor->get_data_type(); + } + } + + return type; +} + +struct tensor_common_params { + const char *name_prefix; + int tensor_rank; + bool is_input; + QNNBackend device; + Qnn_GraphHandle_t graph_handle; + std::shared_ptr qnn_instance; +}; + +void create_tensors_from_ggml_tensor(const tensor_common_params ¶ms, const qnn::ggml_tensor_array_t &ggml_tensors, + qnn::ggml_qnn_tensor_array_t *tensor_wrappers, + std::vector *qnn_tensors) { + using namespace qnn; + + tensor_wrappers->resize(ggml_tensors.size()); + if (qnn_tensors) { + qnn_tensors->resize(ggml_tensors.size()); + } + char buffer[GGML_MAX_NAME] = {}; + auto tensor_type = params.is_input ? 
ggml_qnn_tensor::INPUT : ggml_qnn_tensor::OUTPUT; + for (size_t i = 0; i < ggml_tensors.size(); i++) { + snprintf(buffer, GGML_MAX_NAME, "%s%d", params.name_prefix, (int)i); + auto *ggml_tensor = ggml_tensors[i]; + (*tensor_wrappers)[i] = std::make_shared(tensor_type, std::string(buffer), ggml_tensor->ne, + ggml_tensor->type, params.tensor_rank, params.device, + params.graph_handle, params.qnn_instance); + } +} + +bool bind_tensors(const qnn::ggml_tensor_array_t &ggml_tensors, qnn::ggml_qnn_tensor_array_t &tensor_wrappers, + std::vector &qnn_tensors) { + for (size_t i = 0; i < ggml_tensors.size(); i++) { + auto *ggml_tensor = ggml_tensors[i]; + if (!tensor_wrappers[i]->bind_ggml_tensor(ggml_tensor)) { + QNN_LOG_ERROR("bind tensor %s failed\n", ggml_get_name(ggml_tensor)); + return false; + } + + qnn_tensors[i] = tensor_wrappers[i]->get_qnn_tensor(); + } + + return true; +} + +class ggml_qnn_connectable_op_config : public qnn::ggml_qnn_op_config_base { +public: + explicit ggml_qnn_connectable_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, + std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const qnn::ggml_tensor_array_t &tensor_inputs, + const qnn::ggml_tensor_array_t &tensor_outputs) override { + GGML_UNUSED(device); + GGML_UNUSED(graph_handle); + GGML_UNUSED(tensor_inputs); + GGML_UNUSED(tensor_outputs); + return true; + } + + void set_input_tensors(qnn::ggml_qnn_tensor_array_t &tensor_inputs) { + _tensor_inputs = tensor_inputs; + _qnn_tensor_inputs.resize(_tensor_inputs.size()); + } + + void set_input_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_inputs) { + _tensor_inputs = std::move(tensor_inputs); + _qnn_tensor_inputs.resize(_tensor_inputs.size()); + } + + void set_output_tensors(qnn::ggml_qnn_tensor_array_t &tensor_outputs) { + _tensor_outputs = tensor_outputs; + _qnn_tensor_outputs.resize(_tensor_outputs.size()); + } + + void set_output_tensors(qnn::ggml_qnn_tensor_array_t &&tensor_outputs) { + _tensor_outputs = std::move(tensor_outputs); + _qnn_tensor_outputs.resize(_tensor_outputs.size()); + } + + qnn::ggml_qnn_tensor_array_t &get_input_tensors() { return _tensor_inputs; } + qnn::ggml_qnn_tensor_array_t &get_output_tensors() { return _tensor_outputs; } + +private: + DISABLE_COPY(ggml_qnn_connectable_op_config); + DISABLE_MOVE(ggml_qnn_connectable_op_config); +}; + +} // namespace + +namespace qnn { + +void ggml_qnn_op_config_base::add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_SCALAR; + param.name = _param_names.back().c_str(); + param.scalarParam = scalar; + _qnn_parameters.push_back(param); +} + +bool ggml_qnn_op_config_base::add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, + int rank, const uint8_t *data, const Qnn_DataType_t data_type, + QNNBackend device, Qnn_GraphHandle_t graph_handle) { + std::string tensor_name = _name + name + std::to_string(_tensor_parameters.size()); + auto param_tensor = std::make_shared(ggml_qnn_tensor::PARAMETER, tensor_name, dimensions, + data_type, rank, device, graph_handle, _qnn_instance); + size_t data_size = ggml_type_size(ggml_datatype_from_qnn_datatype(data_type)); + for (int i = 0; i < rank; i++) { + data_size *= dimensions[i]; + } + + GGML_ASSERT(data_size > 0); + if 
(!param_tensor->bind_buffer(const_cast(data), data_size)) { + QNN_LOG_ERROR("parameter tensor bind_buffer failed\n"); + return false; + } + + if (!param_tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("parameter tensor alloc_qnn_tensor_id failed\n"); + return false; + } + + _tensor_parameters.push_back(param_tensor); + _param_names.push_back(name); + Qnn_Param_t param = QNN_PARAM_INIT; + param.paramType = QNN_PARAMTYPE_TENSOR; + param.name = _param_names.back().c_str(); + param.tensorParam = param_tensor->get_qnn_tensor(); + _qnn_parameters.push_back(param); + return true; +} + +bool ggml_qnn_op_config_base::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + GGML_ASSERT(_qnn_tensor_inputs.size() == _tensor_inputs.size()); + GGML_ASSERT(_qnn_tensor_outputs.size() == _tensor_outputs.size()); + + auto qnn_interface = _qnn_instance->get_qnn_interface(); + for (size_t i = 0; i < _tensor_inputs.size(); i++) { + auto tensor = _tensor_inputs[i]; + if (!tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("[%s]input tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + return false; + } + + _qnn_tensor_inputs[i] = tensor->get_qnn_tensor(); + } + + for (size_t i = 0; i < _tensor_outputs.size(); i++) { + auto tensor = _tensor_outputs[i]; + if (!tensor->alloc_qnn_tensor_id()) { + QNN_LOG_ERROR("[%s]output tensor alloc_qnn_tensor_id failed\n", _name.c_str()); + return false; + } + _qnn_tensor_outputs[i] = _tensor_outputs[i]->get_qnn_tensor(); + } + + auto error = qnn_interface->qnn_graph_add_node(graph_handle, get_op_config()); + if (error != QNN_SUCCESS) { + auto *error_str = get_qnn_error_string(error); + if (error_str) { + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %s\n", _name.c_str(), error_str); + } else { + QNN_LOG_ERROR("[%s]qnn_graph_add_node.error: %d\n", _name.c_str(), error); + } + return false; + } + + QNN_LOG_DEBUG("[%s]added to graph\n", _name.c_str()); + return true; +} + +bool ggml_qnn_op_config_base::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + GGML_ASSERT(tensor_inputs.size() == _tensor_inputs.size()); + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_op_config_base::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_outputs.size() == _tensor_outputs.size()); + return bind_tensors(tensor_outputs, _tensor_outputs, _qnn_tensor_outputs); +} + +void ggml_qnn_op_config_base::unbind_input_tensors() { + for (auto &tensor : _tensor_inputs) { + tensor->unbind(); + } +} + +void ggml_qnn_op_config_base::unbind_output_tensors() { + for (auto &tensor : _tensor_outputs) { + tensor->unbind(); + } +} + +Qnn_OpConfig_t ggml_qnn_op_config_base::get_op_config() { + Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; + config.version = QNN_OPCONFIG_VERSION_1; + auto &op_config = config.v1; + op_config.name = _name.c_str(); + op_config.packageName = _package_name.c_str(); + op_config.typeName = _op_type.c_str(); + op_config.numOfParams = (uint32_t)_qnn_parameters.size(); + op_config.params = _qnn_parameters.data(); + op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); + op_config.inputTensors = _qnn_tensor_inputs.data(); + op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); + op_config.outputTensors = _qnn_tensor_outputs.data(); + return config; +} + +bool ggml_qnn_single_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { + const auto tensor_rank = get_rank(tensor_inputs, 
tensor_outputs); + tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); + params.name_prefix = "dst"; + params.is_input = false; + create_tensors_from_ggml_tensor(params, tensor_outputs, &_tensor_outputs, &_qnn_tensor_outputs); + return true; +} + +bool ggml_qnn_matmul_op_config::create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) { + GGML_ASSERT(tensor_inputs.size() == 2); + GGML_ASSERT(tensor_outputs.size() == 1); + const auto tensor_rank = get_rank(tensor_inputs, tensor_outputs); + GGML_ASSERT(tensor_rank >= 2); + + // create input tensors + tensor_common_params params = { "src", tensor_rank, true, device, graph_handle, _qnn_instance }; + create_tensors_from_ggml_tensor(params, tensor_inputs, &_tensor_inputs, &_qnn_tensor_inputs); + + // create output tensor + ggml_qnn_tensor_array_t mat_mul_tensor_outputs; + params.name_prefix = "dst"; + params.is_input = false; + create_tensors_from_ggml_tensor(params, tensor_outputs, &mat_mul_tensor_outputs, nullptr); + + // create mat_mul nodes + return create_mat_mul_nodes(device, graph_handle, tensor_rank, _tensor_inputs, mat_mul_tensor_outputs); +} + +bool ggml_qnn_matmul_op_config::create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + ggml_qnn_tensor_array_t &tensor_inputs, + ggml_qnn_tensor_array_t &tensor_outputs) { + + /* + * First, both the ggml and qnn tensors are stored in memory in row-major format. (For more details, please see: + * https://pytorch.org/blog/tensor-memory-format-matters/#:~:text=Column%20Major%20Order:%20In%20this%20format,%20the%20matrix) + * But the dimensions of the tensor are stored in the opposite order. + * For example, a 2x3 matrix: + * [ + * [1, 2, 3], + * [4, 5, 6], + * ] + * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. + * + * Second, from the ggml introduction here: https://github.com/huggingface/blog/blob/main/introduction-to-ggml.md + * Given 2 matrices A and B, the matrix multiplication C = A * B is defined as: + * ```python + * import torch + * # Create two matrices + * A = torch.tensor([ + * [2, 8], + * [5, 1], + * [4, 2], + * [8, 6], + * ]) + * B = torch.tensor([ + * [10, 5], + * [9, 9], + * [5, 4], + * ]) + * # Perform matrix multiplication + * result = torch.matmul(A, B.T) + * print(result.T) + * ``` + * Here, B.T is the transpose of B.
+ * + * So here we need to create graph like: + * ```mermaid + * graph TD; + * i1>ggml_tensor_in0] --src0--> mat_mul0; + * i2>ggml_tensor_in1] --src1--> transpose0; + * transpose0 --src0_trans--> mat_mul0; + * mat_mul0 --dst_trans--> transpose1; + * transpose1 --dst0--> o1>ggml_tensor_out]; + * ``` + */ + + // create src0_trans tensor + auto src1 = tensor_inputs.back(); + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS does not match the expected value"); + + qnn_dimension_array_t dimensions = get_transposed_dimensions(src1->get_dimensions(), rank); + auto src0_trans = + std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "src0_trans", dimensions, + src1->get_data_type(), rank, device, graph_handle, _qnn_instance); + + // create dst_trans tensor + auto dst = tensor_outputs.front(); + dimensions = get_transposed_dimensions(dst->get_dimensions(), rank); + auto dst_trans = std::make_shared(ggml_qnn_tensor::INTERMEDIATE, "dst_trans", dimensions, + dst->get_data_type(), rank, device, graph_handle, _qnn_instance); + + // create transpose0 + auto transpose0 = std::make_shared(_name + "_trans0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); + + // create transpose1 + auto transpose1 = std::make_shared(_name + "_trans1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, _qnn_instance); + + // create mat_mul + auto mat_mul = std::make_shared(_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + _qnn_instance); + + // set transpose0 parameters + auto *params_data = reinterpret_cast(kTransposeParamData[rank - 1].data()); + const qnn_dimension_array_t param_dims = { (uint32_t)rank, 1, 1, 1 }; + transpose0->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, + graph_handle); + + // set transpose1 parameters + transpose1->add_tensor_param(QNN_OP_TRANSPOSE_PARAM_PERM, param_dims, 1, params_data, QNN_DATATYPE_UINT_32, device, + graph_handle); + + // set tensor to transpose0 + ggml_qnn_tensor_array_t tensors = { tensor_inputs.back() }; + transpose0->set_input_tensors(tensors); + tensors = { src0_trans }; + transpose0->set_output_tensors(tensors); + + // set tensor to mat_mul + tensors = { tensor_inputs.front(), src0_trans }; + mat_mul->set_input_tensors(tensors); + tensors = { dst_trans }; + mat_mul->set_output_tensors(tensors); + + // set tensor to transpose1 + tensors = { dst_trans }; + transpose1->set_input_tensors(tensors); + transpose1->set_output_tensors(tensor_outputs); + + _mat_mul = mat_mul; + _transpose0 = transpose0; + _transpose1 = transpose1; + return true; +} + +bool ggml_qnn_matmul_op_config::add_op_to_graph(Qnn_GraphHandle_t graph_handle) { + for (auto &convert : _input_converts) { + if (convert && !convert->add_op_to_graph(graph_handle)) { + return false; + } + } + + return _transpose0->add_op_to_graph(graph_handle) && _mat_mul->add_op_to_graph(graph_handle) && + _transpose1->add_op_to_graph(graph_handle) && + (!_output_convert || _output_convert->add_op_to_graph(graph_handle)); +} + +bool ggml_qnn_matmul_op_config::bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) { + return bind_tensors(tensor_inputs, _tensor_inputs, _qnn_tensor_inputs); +} + +bool ggml_qnn_matmul_op_config::bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) { + if (_output_convert) { + return _output_convert->bind_output_tensors(tensor_outputs); + } else { + return _transpose1->bind_output_tensors(tensor_outputs); + } +} + +void ggml_qnn_matmul_op_config::unbind_input_tensors() { + _mat_mul->unbind_input_tensors(); + 
_transpose0->unbind_input_tensors(); + for (auto &convert : _input_converts) { + if (convert) { + convert->unbind_input_tensors(); + } + } +} + +void ggml_qnn_matmul_op_config::unbind_output_tensors() { + _transpose1->unbind_output_tensors(); + if (_output_convert) { + _output_convert->unbind_output_tensors(); + } +} + +std::vector &ggml_qnn_matmul_op_config::get_qnn_output_tensors() { + if (_output_convert) { + return _output_convert->get_qnn_output_tensors(); + } else { + return _transpose1->get_qnn_output_tensors(); + } +} + +ggml_op_constructor_t create_op_constructor(const std::string &op_name) { + if (op_name == QNN_OP_MAT_MUL) { + // For QNN_OP_MAT_MUL, we need to transpose the input tensor + return [](const std::string &instance_name, + std::shared_ptr qnn_instance) -> std::unique_ptr { + QNN_LOG_DEBUG("create QNN_OP_MAT_MUL, name %s\n", instance_name.c_str()); + return std::make_unique(instance_name, qnn_instance); + }; + } + + return [op_name](const std::string &instance_name, + std::shared_ptr qnn_instance) -> std::unique_ptr { + return std::make_unique(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name, + qnn_instance); + }; +} + +} // namespace qnn diff --git a/ggml/src/ggml-qnn/op-config.hpp b/ggml/src/ggml-qnn/op-config.hpp index 7852ee84dc12f..2016cb4ac994d 100644 --- a/ggml/src/ggml-qnn/op-config.hpp +++ b/ggml/src/ggml-qnn/op-config.hpp @@ -1,73 +1,122 @@ #pragma once +#include +#include #include #include #include "ggml-qnn.h" -#include "logger.hpp" #include "qnn-lib.hpp" #include "qnn-types.hpp" #include "tensor.hpp" namespace qnn { + +using ggml_tensor_array_t = std::vector; + class ggml_qnn_op_config { public: - explicit ggml_qnn_op_config(const std::string &name, const std::string &package_name, const std::string &op_type) : - _name(name), _package_name(package_name), _op_type(op_type) {} - - void set_input_tensors(const std::vector> &tensor_inputs) { - _qnn_tensor_inputs.resize(tensor_inputs.size()); - for (size_t i = 0; i < tensor_inputs.size(); i++) { - _qnn_tensor_inputs[i] = tensor_inputs[i]->get_qnn_tensor(); - } - } - - void set_output_tensors(const std::vector> &tensor_outputs) { - _qnn_tensor_outputs.resize(tensor_outputs.size()); - for (size_t i = 0; i < tensor_outputs.size(); i++) { - _qnn_tensor_outputs[i] = tensor_outputs[i]->get_qnn_tensor(); - } - } - - void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar) { - _param_names.push_back(name); - Qnn_Param_t param = QNN_PARAM_INIT; - param.paramType = QNN_PARAMTYPE_SCALAR; - param.name = _param_names.back().c_str(); - param.scalarParam = scalar; - _parameters.push_back(param); - } - - std::vector &get_qnn_input_tensors() { return _qnn_tensor_inputs; } - std::vector &get_qnn_output_tensors() { return _qnn_tensor_outputs; } - - Qnn_OpConfig_t get_op_config() { - Qnn_OpConfig_t config = QNN_OPCONFIG_INIT; - config.version = QNN_OPCONFIG_VERSION_1; - auto &op_config = config.v1; - op_config.name = _name.c_str(); - op_config.packageName = _package_name.c_str(); - op_config.typeName = _op_type.c_str(); - op_config.numOfParams = (uint32_t)_parameters.size(); - op_config.params = _parameters.data(); - op_config.numOfInputs = (uint32_t)_qnn_tensor_inputs.size(); - op_config.inputTensors = _qnn_tensor_inputs.data(); - op_config.numOfOutputs = (uint32_t)_qnn_tensor_outputs.size(); - op_config.outputTensors = _qnn_tensor_outputs.data(); - return config; - } + virtual ~ggml_qnn_op_config() {} + virtual bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, + const ggml_tensor_array_t 
&tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) = 0; + virtual std::vector &get_qnn_input_tensors() = 0; + virtual std::vector &get_qnn_output_tensors() = 0; + virtual bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) = 0; + virtual bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) = 0; + virtual bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) = 0; + virtual void unbind_input_tensors() = 0; + virtual void unbind_output_tensors() = 0; +}; + +class ggml_qnn_op_config_base : public ggml_qnn_op_config { +public: + explicit ggml_qnn_op_config_base(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) : + _name(name), _package_name(package_name), _op_type(op_type), _qnn_instance(qnn_instance) {} + + void add_scalar_param(const std::string &name, const Qnn_Scalar_t scalar); + bool add_tensor_param(const std::string &name, const qnn_dimension_array_t &dimensions, int rank, + const uint8_t *data, const Qnn_DataType_t data_type, QNNBackend device, + Qnn_GraphHandle_t graph_handle); + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() override { return _qnn_tensor_outputs; } + +protected: + Qnn_OpConfig_t get_op_config(); -private: std::string _name; std::string _package_name; std::string _op_type; + std::shared_ptr _qnn_instance; + ggml_qnn_tensor_array_t _tensor_inputs; + ggml_qnn_tensor_array_t _tensor_outputs; + ggml_qnn_tensor_array_t _tensor_parameters; std::vector _qnn_tensor_inputs; std::vector _qnn_tensor_outputs; - std::vector _parameters; + std::vector _qnn_parameters; std::vector _param_names; - DISABLE_COPY(ggml_qnn_op_config); - DISABLE_MOVE(ggml_qnn_op_config); + DISABLE_COPY(ggml_qnn_op_config_base); + DISABLE_MOVE(ggml_qnn_op_config_base); +}; + +class ggml_qnn_single_op_config : public ggml_qnn_op_config_base { +public: + explicit ggml_qnn_single_op_config(const std::string &name, const std::string &package_name, + const std::string &op_type, std::shared_ptr qnn_instance) : + ggml_qnn_op_config_base(name, package_name, op_type, qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; + +private: + DISABLE_COPY(ggml_qnn_single_op_config); + DISABLE_MOVE(ggml_qnn_single_op_config); }; + +class ggml_qnn_matmul_op_config : public ggml_qnn_op_config { +public: + ggml_qnn_matmul_op_config(const std::string &name, std::shared_ptr qnn_instance) : + _name(name), _qnn_instance(qnn_instance) {} + + bool create_tensors(QNNBackend device, Qnn_GraphHandle_t graph_handle, const ggml_tensor_array_t &tensor_inputs, + const ggml_tensor_array_t &tensor_outputs) override; + bool add_op_to_graph(Qnn_GraphHandle_t graph_handle) override; + bool bind_input_tensors(const ggml_tensor_array_t &tensor_inputs) override; + bool bind_output_tensors(const ggml_tensor_array_t &tensor_outputs) override; + void unbind_input_tensors() override; + void unbind_output_tensors() override; + std::vector &get_qnn_input_tensors() override { return _qnn_tensor_inputs; } + std::vector &get_qnn_output_tensors() override; 
+ +private: + bool create_mat_mul_nodes(QNNBackend device, Qnn_GraphHandle_t graph_handle, const int rank, + ggml_qnn_tensor_array_t &tensor_inputs, ggml_qnn_tensor_array_t &tensor_outputs); + + std::string _name; + std::shared_ptr _qnn_instance; + std::shared_ptr _transpose0; + std::shared_ptr _transpose1; + std::shared_ptr _mat_mul; + std::vector> _input_converts; + std::shared_ptr _output_convert; + ggml_qnn_tensor_array_t _tensor_inputs; + std::vector _qnn_tensor_inputs; + + DISABLE_COPY(ggml_qnn_matmul_op_config); + DISABLE_MOVE(ggml_qnn_matmul_op_config); +}; + +using ggml_op_constructor_t = + std::function(const std::string &, std::shared_ptr)>; + +ggml_op_constructor_t create_op_constructor(const std::string &op_name); + } // namespace qnn diff --git a/ggml/src/ggml-qnn/qnn-lib.hpp b/ggml/src/ggml-qnn/qnn-lib.hpp index d55f730f80d84..74bc2b3f95f6b 100644 --- a/ggml/src/ggml-qnn/qnn-lib.hpp +++ b/ggml/src/ggml-qnn/qnn-lib.hpp @@ -637,7 +637,7 @@ class qnn_instance { return mem_fd; } - Qnn_MemHandle_t register_rpcmem(void *p_data, uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { + Qnn_MemHandle_t register_rpcmem(void *p_data, const uint32_t rank, uint32_t *dimensions, Qnn_DataType_t data_type) { if (!p_data) { QNN_LOG_WARN("invalid param\n"); return nullptr; diff --git a/ggml/src/ggml-qnn/tensor.hpp b/ggml/src/ggml-qnn/tensor.hpp index c465d17f25506..faf5b0df5f4e1 100644 --- a/ggml/src/ggml-qnn/tensor.hpp +++ b/ggml/src/ggml-qnn/tensor.hpp @@ -1,8 +1,10 @@ #pragma once +#include #include #include +#include #include #include #include @@ -16,55 +18,81 @@ namespace qnn { +static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + class ggml_qnn_tensor { public: - explicit ggml_qnn_tensor(const std::string &name, QNNBackend device, Qnn_GraphHandle_t graph_handle, + typedef enum _tensor_type { INPUT, OUTPUT, INTERMEDIATE, PARAMETER } tensor_type_t; + + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, + const qnn_dimension_array_t &dimensions, Qnn_DataType_t data_type, int rank, + QNNBackend device, Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : _tensor_name(name), _device(device), _qnn_instance(qnn_instance), _graph_handle(graph_handle) { - QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); + if (!_tensor_name.empty()) { + QNN_TENSOR_SET_NAME(_qnn_tensor, _tensor_name.c_str()); + } QNN_TENSOR_SET_DIMENSIONS(_qnn_tensor, _dimensions.data()); - QNN_TENSOR_SET_TYPE(_qnn_tensor, QNN_TENSOR_TYPE_NATIVE); QNN_TENSOR_SET_DATA_FORMAT(_qnn_tensor, QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER); - QNN_LOG_DEBUG("create tensor %s, device: %d", _tensor_name.c_str(), device); + + _dimensions = dimensions; + update_params_from_ggml_tensor(tensor_type, data_type, rank); + QNN_LOG_DEBUG("create tensor %s, rank: %d, dims: [%d, %d, %d, %d], data_type: %d, device: %d", + _tensor_name.c_str(), rank, (int)_dimensions[0], (int)_dimensions[1], (int)_dimensions[2], + (int)_dimensions[3], (int)data_type, (int)device); } + explicit ggml_qnn_tensor(tensor_type_t tensor_type, const std::string &name, + const ggml_dimension_array_t &dimensions, ggml_type data_type, int rank, QNNBackend device, + Qnn_GraphHandle_t graph_handle, std::shared_ptr qnn_instance) : + ggml_qnn_tensor(tensor_type, name, get_internal_dimension(dimensions, rank), + qnn_datatype_from_ggml_datatype(data_type), rank, device, graph_handle, qnn_instance) {} + ~ggml_qnn_tensor() { _qnn_rpc_buffer.reset(); } - bool bind_ggml_tensor(ggml_tensor *tensor, bool is_input, int prev_max_rank) { - 
if (_tensor) { - if (_tensor != tensor) { - QNN_LOG_WARN("tensor %s has been bound to another ggml tensor %s", _tensor_name.c_str(), - ggml_get_name(_tensor)); - return false; - } - QNN_LOG_INFO("tensor %s already bound to same ggml tensor %s", _tensor_name.c_str(), - ggml_get_name(_tensor)); + bool alloc_qnn_tensor_id() { + if (QNN_TENSOR_GET_ID(_qnn_tensor)) { + QNN_LOG_WARN("graph tensor %s already created, id %d", _tensor_name.c_str(), + QNN_TENSOR_GET_ID(_qnn_tensor)); return true; } - update_params_from_ggml_tensor(tensor, prev_max_rank); - Qnn_TensorType_t new_tensor_type = is_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_APP_READ; - QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); - QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); + Qnn_Tensor_t qnn_tensor = _qnn_tensor; + auto qnn_interface = _qnn_instance->get_qnn_interface(); + auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); + if (error != QNN_SUCCESS) { + QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return false; + } + + QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); + QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), QNN_TENSOR_GET_ID(qnn_tensor), + QNN_TENSOR_GET_RANK(qnn_tensor)); - if (!QNN_TENSOR_GET_ID(_qnn_tensor)) { - Qnn_Tensor_t qnn_tensor = _qnn_tensor; - auto qnn_interface = _qnn_instance->get_qnn_interface(); - auto error = qnn_interface->qnn_tensor_create_graph_tensor(_graph_handle, &qnn_tensor); - if (error != QNN_SUCCESS) { - QNN_LOG_WARN("create graph tensor failed, tensor %s, error: %d\n", _tensor_name.c_str(), error); + return true; + } + + bool bind_buffer(uint8_t *buffer, const size_t buffer_size) { + if (_buffer) { + if (_buffer != buffer) { + QNN_LOG_WARN("tensor %s has been bound to another buffer %p", _tensor_name.c_str(), _buffer); return false; } - QNN_TENSOR_SET_ID(_qnn_tensor, QNN_TENSOR_GET_ID(qnn_tensor)); - QNN_LOG_DEBUG("create graph tensor %s, id: %d, rank: %d", _tensor_name.c_str(), - QNN_TENSOR_GET_ID(qnn_tensor), QNN_TENSOR_GET_RANK(qnn_tensor)); + QNN_LOG_INFO("tensor %s already bound to same ggml tensor %p", _tensor_name.c_str(), _buffer); + return true; + } + + if (QNN_TENSOR_GET_TYPE(_qnn_tensor) == QNN_TENSOR_TYPE_NATIVE) { + QNN_LOG_DEBUG("tensor %s type(%d) not READ/WRITE, skipping", _tensor_name.c_str(), + (int)QNN_TENSOR_TYPE_NATIVE); + return true; } if (should_use_mem_handle()) { if (!_qnn_rpc_buffer) { auto qnn_rpc_buffer = std::make_unique( - _qnn_instance, ggml_nbytes(tensor), QNN_TENSOR_GET_RANK(_qnn_tensor), + _qnn_instance, buffer_size, QNN_TENSOR_GET_RANK(_qnn_tensor), QNN_TENSOR_GET_DIMENSIONS(_qnn_tensor), QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor)); if (!qnn_rpc_buffer->is_valid()) { QNN_LOG_WARN("alloc rpc mem failed, tensor %s", _tensor_name.c_str()); @@ -79,30 +107,41 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, use mem handle %p", _tensor_name.c_str(), QNN_TENSOR_GET_MEM_HANDLE(_qnn_tensor)); } else { QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); - Qnn_ClientBuffer_t client_buf = { tensor->data, get_ggml_tensor_data_size(tensor) }; + Qnn_ClientBuffer_t client_buf = { buffer, (uint32_t)buffer_size }; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); QNN_LOG_DEBUG("tensor %s, use client buffer %p size %d", _tensor_name.c_str(), client_buf.data, (int)client_buf.dataSize); } - _tensor = tensor; + _buffer = buffer; + _buffer_size = buffer_size; if (!write_to_qnn_tensor()) 
{ QNN_LOG_WARN("write to qnn tensor failed, tensor %s", _tensor_name.c_str()); return false; } - QNN_LOG_DEBUG("bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); + QNN_LOG_DEBUG("bind tensor %s to buffer: %p, size: %d", _tensor_name.c_str(), buffer, (int)buffer_size); + return true; + } + + bool bind_ggml_tensor(ggml_tensor *tensor) { + if (!bind_buffer(reinterpret_cast(tensor->data), ggml_nbytes(tensor))) { + QNN_LOG_WARN("Failed to bind tensor: %s to ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(tensor)); + return false; + } + + QNN_LOG_DEBUG("Bind tensor %s to ggml tensor %s", _tensor_name.c_str(), ggml_get_name(tensor)); return true; } - bool unbind_ggml_tensor() { + bool unbind() { if (!_graph_handle) { QNN_LOG_WARN("tensor %s not bound to any graph", _tensor_name.c_str()); return false; } - if (!_tensor) { + if (!_buffer) { QNN_LOG_DEBUG("tensor %s not bound to ggml tensor", _tensor_name.c_str()); return true; } @@ -119,12 +158,15 @@ class ggml_qnn_tensor { QNN_LOG_DEBUG("tensor %s, clear client buffer", _tensor_name.c_str()); } - QNN_LOG_DEBUG("unbind tensor: %s from ggml tensor: %s", _tensor_name.c_str(), ggml_get_name(_tensor)); - _tensor = nullptr; + QNN_LOG_DEBUG("unbind tensor: %s from buffer: %p, size: %d", _tensor_name.c_str(), _buffer, (int)_buffer_size); + _buffer = nullptr; + _buffer_size = 0; return true; } const Qnn_Tensor_t &get_qnn_tensor() const { return _qnn_tensor; } + Qnn_DataType_t get_data_type() const { return QNN_TENSOR_GET_DATA_TYPE(_qnn_tensor); } + const qnn_dimension_array_t &get_dimensions() const { return _dimensions; } private: bool write_to_qnn_tensor() { @@ -136,7 +178,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_qnn_rpc_buffer->get_buffer(), _tensor->data, ggml_nbytes(_tensor)); + memcpy(_qnn_rpc_buffer->get_buffer(), _buffer, _buffer_size); } else { QNN_LOG_WARN("tensor %s: can't find rpcmem from qnn mem handle\n", _tensor_name.c_str()); return false; @@ -157,7 +199,7 @@ class ggml_qnn_tensor { if (should_use_mem_handle()) { if (_qnn_rpc_buffer) { - memcpy(_tensor->data, _qnn_rpc_buffer->get_buffer(), ggml_nbytes(_tensor)); + memcpy(_buffer, _qnn_rpc_buffer->get_buffer(), _buffer_size); } else { QNN_LOG_WARN("can't find rpcmem from qnn mem handle\n"); return false; @@ -169,29 +211,45 @@ class ggml_qnn_tensor { return true; } - void update_params_from_ggml_tensor(ggml_tensor *tensor, int prev_max_rank) { - _dimensions[0] = (uint32_t)tensor->ne[0]; - _dimensions[1] = (uint32_t)tensor->ne[1]; - _dimensions[2] = (uint32_t)tensor->ne[2]; - _dimensions[3] = (uint32_t)tensor->ne[3]; - QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, device_datatype_from_ggml_datatype(tensor->type)); + void update_params_from_ggml_tensor(tensor_type_t tensor_type, Qnn_DataType_t data_type, int rank) { + QNN_TENSOR_SET_DATA_TYPE(_qnn_tensor, data_type); // TODO: set the quantizeParams base on the tensor type - QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)std::max(prev_max_rank, ggml_n_dims(tensor))); - + QNN_TENSOR_SET_RANK(_qnn_tensor, (uint32_t)rank); QNN_TENSOR_SET_MEM_TYPE(_qnn_tensor, QNN_TENSORMEMTYPE_RAW); Qnn_ClientBuffer_t client_buf = {}; QNN_TENSOR_SET_CLIENT_BUF(_qnn_tensor, client_buf); + + Qnn_TensorType_t new_tensor_type; + switch (tensor_type) { + case INPUT: + new_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + break; + case OUTPUT: + new_tensor_type = QNN_TENSOR_TYPE_APP_READ; + break; + case PARAMETER: + new_tensor_type = QNN_TENSOR_TYPE_STATIC; + break; + default: + new_tensor_type = 
QNN_TENSOR_TYPE_NATIVE; + break; + } + QNN_TENSOR_SET_TYPE(_qnn_tensor, new_tensor_type); + QNN_LOG_INFO("tensor %s changed to type %d", _tensor_name.c_str(), new_tensor_type); } - bool should_use_mem_handle() const { return _device == QNN_BACKEND_NPU; } + bool should_use_mem_handle() const { + return _device == QNN_BACKEND_NPU && QNN_TENSOR_GET_TYPE(_qnn_tensor) != QNN_TENSOR_TYPE_STATIC; + } std::string _tensor_name; - const ggml_tensor *_tensor; + uint8_t *_buffer = nullptr; + size_t _buffer_size = 0; QNNBackend _device; std::shared_ptr _qnn_instance; Qnn_Tensor_t _qnn_tensor = qnn_tensor_init(kDefaultQnnTensorVersion); - std::array _dimensions = {}; + qnn_dimension_array_t _dimensions = {}; Qnn_GraphHandle_t _graph_handle = nullptr; std::unique_ptr _qnn_rpc_buffer; @@ -199,4 +257,6 @@ class ggml_qnn_tensor { DISABLE_MOVE(ggml_qnn_tensor); }; +using ggml_qnn_tensor_array_t = std::vector>; + } // namespace qnn diff --git a/ggml/src/ggml-qnn/utils.cpp b/ggml/src/ggml-qnn/utils.cpp index e44d6dbccee42..0de9d203ebee9 100644 --- a/ggml/src/ggml-qnn/utils.cpp +++ b/ggml/src/ggml-qnn/utils.cpp @@ -9,14 +9,40 @@ namespace qnn { +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS should be 4"); + GGML_ASSERT(rank <= GGML_MAX_DIMS && rank > 0); + + qnn_dimension_array_t internal_dims = {}; + /* + * Both the ggml and qnn tensor in memory are stored as row-major format. + * But the dimensions of the tensor are stored in different order. + * For example, a 2x3 matrix: + * [ + * [1, 2, 3], + * [4, 5, 6], + * ] + * The ggml tensor will have dimensions [3, 2], while the qnn tensor will have dimensions [2, 3]. + */ + for (uint32_t i = 0; i < rank; i++) { + internal_dims[i] = std::max(dims[rank - 1 - i], 1); + } + + return internal_dims; +} + // TODO: mapping more ggml data type to QNN data type // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { +Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type) { switch (ggml_type) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; case GGML_TYPE_F32: return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_I32: + return QNN_DATATYPE_INT_32; + case GGML_TYPE_I16: + return QNN_DATATYPE_INT_16; case GGML_TYPE_I8: return QNN_DATATYPE_INT_8; case GGML_TYPE_Q8_0: @@ -29,16 +55,75 @@ Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type) { return QNN_DATATYPE_UNDEFINED; } -Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor) { - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_NATIVE; +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return GGML_TYPE_F32; + case QNN_DATATYPE_FLOAT_16: + return GGML_TYPE_F16; + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return GGML_TYPE_I32; + case QNN_DATATYPE_INT_16: + return GGML_TYPE_I16; + case QNN_DATATYPE_INT_8: + return GGML_TYPE_I8; + case QNN_DATATYPE_SFIXED_POINT_8: + return GGML_TYPE_Q8_0; + case QNN_DATATYPE_SFIXED_POINT_4: + return GGML_TYPE_Q4_0; + default: + break; + } + return GGML_TYPE_COUNT; +} - if (ggml_tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (ggml_tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; +size_t 
qnn_datatype_size(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return sizeof(float); + case QNN_DATATYPE_FLOAT_16: + return sizeof(uint16_t); + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return sizeof(int32_t); + case QNN_DATATYPE_INT_16: + return sizeof(int16_t); + case QNN_DATATYPE_INT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_4: + return sizeof(int8_t); + default: + break; } + return 0; +} - return qnn_tensor_type; +const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return "QNN_DATATYPE_FLOAT_32"; + case QNN_DATATYPE_FLOAT_16: + return "QNN_DATATYPE_FLOAT_16"; + case QNN_DATATYPE_UINT_32: + return "QNN_DATATYPE_UINT_32"; + case QNN_DATATYPE_INT_32: + return "QNN_DATATYPE_INT_32"; + case QNN_DATATYPE_INT_16: + return "QNN_DATATYPE_INT_16"; + case QNN_DATATYPE_INT_8: + return "QNN_DATATYPE_INT_8"; + case QNN_DATATYPE_SFIXED_POINT_8: + return "QNN_DATATYPE_SFIXED_POINT_8"; + case QNN_DATATYPE_SFIXED_POINT_4: + return "QNN_DATATYPE_SFIXED_POINT_4"; + default: + break; + } + + return "QNN_DATATYPE_UNDEFINED"; } uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { @@ -51,8 +136,13 @@ uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor) { return rank; } -const char *get_backend_name(int n_backend_type) { - switch (n_backend_type) { +const char *get_ggml_type_name(ggml_type type) { + const auto *traits = ggml_get_type_traits(type); + return traits->type_name; +} + +const char *get_backend_name(size_t device_index) { + switch (device_index) { case QNN_BACKEND_CPU: return "QNN-CPU"; case QNN_BACKEND_GPU: diff --git a/ggml/src/ggml-qnn/utils.hpp b/ggml/src/ggml-qnn/utils.hpp index b7f29bdaa5663..2c58d037982f6 100644 --- a/ggml/src/ggml-qnn/utils.hpp +++ b/ggml/src/ggml-qnn/utils.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include "ggml.h" @@ -17,8 +18,14 @@ namespace qnn { +using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; +using qnn_dimension_array_t = std::array; + +qnn_dimension_array_t get_internal_dimension(const ggml_dimension_array_t &dims, uint32_t rank); + uint32_t get_ggml_tensor_rank(const ggml_tensor *tensor); -const char *get_backend_name(int n_backend_type); +const char *get_ggml_type_name(ggml_type type); +const char *get_backend_name(size_t device_index); const char *get_chipset_desc(uint32_t chipset_id); const char *get_htparch_desc(size_t htp_arch); intptr_t align_to(size_t alignment, intptr_t offset); @@ -187,8 +194,10 @@ inline void set_qnn_tensor_dyn_dimensions(Qnn_Tensor_t &tensor, uint8_t *isDynam } } -Qnn_DataType_t device_datatype_from_ggml_datatype(ggml_type ggml_type); -Qnn_TensorType_t device_tensortype_from_ggml_tensor(ggml_tensor *ggml_tensor); +Qnn_DataType_t qnn_datatype_from_ggml_datatype(ggml_type ggml_type); +ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type); +size_t qnn_datatype_size(Qnn_DataType_t qnn_type); +const char *qnn_datatype_to_string(Qnn_DataType_t qnn_type); #if ENABLE_QNNBACKEND_PERF class qnn_perf {
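The dimension mapping used throughout this patch (the new `get_internal_dimension()` in utils.cpp and the comment in `create_mat_mul_nodes()`): ggml keeps the fastest-varying extent in `ne[0]`, while QNN lists the slowest-varying dimension first, so the patch reverses the array and clamps unused entries to 1. A minimal standalone sketch of that mapping, assuming GGML_MAX_DIMS == 4 as the patch asserts (illustrative names, not part of the patch):

```cpp
// Standalone sketch (not from the patch): mirrors the reversal that
// get_internal_dimension() performs when mapping a ggml `ne` array to QNN dimensions.
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstdio>

using qnn_dims_t = std::array<uint32_t, 4>; // assumes GGML_MAX_DIMS == 4

static qnn_dims_t to_qnn_dimensions(const int64_t (&ne)[4], uint32_t rank) {
    qnn_dims_t dims = {};
    // ggml: ne[0] is the fastest-varying extent; QNN: slowest-varying dimension first.
    for (uint32_t i = 0; i < rank; i++) {
        dims[i] = static_cast<uint32_t>(std::max<int64_t>(ne[rank - 1 - i], 1));
    }
    return dims;
}

int main() {
    // The 2x3 matrix from the comment: ggml describes it as ne = [3, 2], QNN as [2, 3].
    const int64_t ne[4] = { 3, 2, 1, 1 };
    const qnn_dims_t dims = to_qnn_dimensions(ne, 2);
    std::printf("ggml ne = [3, 2] -> qnn dims = [%u, %u]\n", dims[0], dims[1]);
    return 0;
}
```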
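The torch example quoted in the `create_mat_mul_nodes()` comment is what motivates the TRANSPOSE node placed in front of QNN's MatMul: the convention it cites computes A * B^T and reads the result back transposed. A small standalone check that reproduces those numbers (again illustrative, not part of the patch):

```cpp
// Standalone sketch (not from the patch): reproduces the torch example quoted in the
// create_mat_mul_nodes() comment, i.e. result = matmul(A, B.T), printed transposed.
#include <cstdio>

int main() {
    const int A[4][2] = { { 2, 8 }, { 5, 1 }, { 4, 2 }, { 8, 6 } };
    const int B[3][2] = { { 10, 5 }, { 9, 9 }, { 5, 4 } };

    int result[4][3] = {}; // result[i][j] = dot(A[i], B[j]) == (A * B^T)[i][j]
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 3; j++) {
            for (int k = 0; k < 2; k++) {
                result[i][j] += A[i][k] * B[j][k];
            }
        }
    }

    // Print result^T (3x4), matching `print(result.T)` in the comment.
    for (int j = 0; j < 3; j++) {
        for (int i = 0; i < 4; i++) {
            std::printf("%5d", result[i][j]);
        }
        std::printf("\n");
    }
    return 0;
}
```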