Merged

47 commits
832c29b
ggml_qnn_op_config now manages the construction of ggml_qnn_tensor
chraac Sep 7, 2024
0c428d9
wip
chraac Sep 9, 2024
ef763f4
add interface ggml_qnn_op_config
chraac Sep 9, 2024
94499ad
add ggml_qnn_list_op_config
chraac Sep 9, 2024
2f661df
add create_tensor and move tensor bind to execute
chraac Sep 10, 2024
d8caaec
wip
chraac Sep 10, 2024
cff20aa
rename: ggml_qnn_list_op_config -> ggml_qnn_matmul_op_config
chraac Sep 11, 2024
bff365a
add tensortype to allow native tensor
chraac Sep 11, 2024
b47929c
remove ggml_tensor param at ggml_qnn_tensor::create_tensor
chraac Sep 11, 2024
50a88fa
postpone the tensor id allocation to add_node
chraac Sep 11, 2024
5da526a
add ggml_qnn_op_config_base
chraac Sep 12, 2024
6e6bfbe
trivial change to reduce function params
chraac Sep 12, 2024
6ade608
split bind_tensors into bind_input_tensors and bind_output_tensors
chraac Sep 12, 2024
c1bd94c
implement ggml_qnn_single_op_config::create_tensors
chraac Sep 13, 2024
74d5016
tensor: add bind buffer
chraac Sep 15, 2024
f53c016
add parameter tensor type
chraac Sep 19, 2024
3b69e71
implement add_tensor_param
chraac Sep 19, 2024
ed181a1
set qnn_instance only at constructor
chraac Sep 19, 2024
8f22c15
set transpose tensor param
chraac Sep 19, 2024
189325f
move create_op_constructor into op-config module
chraac Sep 20, 2024
222f9a1
create QNN_OP_MAT_MUL from ggml_qnn_matmul_op_config
chraac Sep 20, 2024
378d2ba
try fix crash
chraac Sep 20, 2024
72f0f65
fix compiling error at older ndk (r23c)
chraac Sep 28, 2024
d205acb
fix crash
chraac Sep 29, 2024
660a8ae
Merge branch 'dev-refactoring' into dev-multi-op-in-one-graph
chraac Sep 30, 2024
2dc0bbd
fix parameter tensor name
chraac Oct 1, 2024
07fc1e6
update tensor dimension assignment and add TODO
chraac Oct 2, 2024
fc8b521
fix mat_mul graph creation
chraac Oct 3, 2024
a6deb22
fix MUL_MAT_256x16x10x1_256x1x10x1_16x1x10x1
chraac Oct 3, 2024
938075c
Merge branch 'dev-refactoring' into dev-multi-op-in-one-graph
chraac Oct 11, 2024
82fcd12
append type to graph cache key
chraac Oct 11, 2024
b2281a0
wip
chraac Oct 11, 2024
d923257
fix supported op
chraac Oct 11, 2024
42a7c41
update comment
chraac Oct 13, 2024
4071999
disable ops other than add and mat_mul
chraac Oct 13, 2024
328369a
add convert op to adapt multi input/output format
chraac Oct 15, 2024
7c798c0
disable f16 for cpu backend according to official doc
chraac Oct 19, 2024
4688211
add supported data types flags in each backend
chraac Oct 19, 2024
c560733
remove unused functions
chraac Oct 23, 2024
173371c
append output type to graph key
chraac Oct 23, 2024
baca4cb
Merge branch 'dev-refactoring' into dev-multi-op-in-one-graph
chraac Oct 23, 2024
2547eca
fix gpu backend by disabling ops with mismatched data types
chraac Oct 24, 2024
657db64
fix cpu backend supported ops
chraac Oct 24, 2024
327b3db
fix duplicated tensor name
chraac Oct 26, 2024
1c7f136
append op name
chraac Oct 28, 2024
452197c
suppress warning
chraac Oct 28, 2024
8448acd
remove unused code
chraac Oct 28, 2024
36 changes: 28 additions & 8 deletions ggml/src/ggml-qnn.cpp
@@ -51,12 +51,30 @@ struct qnn_device_caps {
const char *description;
const char *lib_name;
enum ggml_backend_dev_type type;

// TODO: should get this caps from device
std::unordered_set<ggml_type> supported_types;
};

const qnn_device_caps kDeviceCaps[GGML_QNN_MAX_DEVICES]{
{ "qnn-cpu", "Qualcomm Kryo CPU", "libQnnCpu.so", GGML_BACKEND_DEVICE_TYPE_CPU }, /* QNN_BACKEND_CPU */
{ "qnn-gpu", "Qualcomm Adreno GPU", "libQnnGpu.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_GPU */
{ "qnn-npu", "Qualcomm NPU", "libQnnHtp.so", GGML_BACKEND_DEVICE_TYPE_GPU }, /* QNN_BACKEND_NPU */
{ "qnn-cpu",
"Qualcomm Kryo CPU",
"libQnnCpu.so",
GGML_BACKEND_DEVICE_TYPE_CPU,
{ GGML_TYPE_F32,
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/CpuOpDefSupplement.html#matmul
{ "qnn-gpu",
"Qualcomm Adreno GPU",
"libQnnGpu.so",
GGML_BACKEND_DEVICE_TYPE_GPU,
{ GGML_TYPE_F32,
GGML_TYPE_F16 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/GpuOpDefSupplement.html#matmul
{ "qnn-npu",
"Qualcomm NPU",
"libQnnHtp.so",
GGML_BACKEND_DEVICE_TYPE_GPU,
{ GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_I16,
GGML_TYPE_I8 } }, // https://docs.qualcomm.com/bundle/publicresource/topics/80-63442-50/HtpOpDefSupplement.html#matmul
};
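Each backend entry now carries an explicit set of supported ggml types, sourced from the Qualcomm op-definition supplements linked above. A minimal sketch of how a caller might consult this table; the helper name is hypothetical, and it assumes it sits in the same translation unit where kDeviceCaps is defined:

```cpp
// Hypothetical helper, not part of the PR; assumes kDeviceCaps and
// GGML_QNN_MAX_DEVICES from the surrounding file are visible.
static bool device_supports_type(size_t device_index, ggml_type type) {
    GGML_ASSERT(device_index < GGML_QNN_MAX_DEVICES);
    const auto &types = kDeviceCaps[device_index].supported_types;
    // std::unordered_set lookup: O(1) on average
    return types.count(type) > 0;
}
```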

class ggml_backend_qnn_buffer_context {
@@ -340,9 +358,10 @@ void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, struct ggml_backe
props->type = ggml_backend_qnn_device_get_type(dev);
ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
/* async */ false,
/* host_buffer */ false,
/* events */ false,
/* async */ false,
/* host_buffer */ false,
/* buffer_from_host_ptr */ false,
/* events */ false,
};
}

@@ -412,6 +431,7 @@ ggml_backend_t ggml_backend_qnn_init_with_device_context(ggml_backend_dev_t dev,
dev_ctx->instance = instance;
dev_ctx->qnn_interface = qnn_interface;
dev_ctx->socinfo = instance->get_soc_info();
dev_ctx->supported_types = kDeviceCaps[device_index].supported_types;

ggml_backend_t qnn_backend = new ggml_backend{
/* .guid = */ ggml_backend_qnn_guid(),
@@ -440,8 +460,8 @@ ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_ptr(ggml_backend_dev_t
}

bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor *op) {
GGML_UNUSED(dev);
return qnn::ggml_qnn_supports_op(op);
auto *device_ctx = get_device_context(dev);
return qnn::ggml_qnn_supports_op(device_ctx, op);
}

bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
114 changes: 71 additions & 43 deletions ggml/src/ggml-qnn/backend-ops.cpp
@@ -108,41 +108,20 @@ std::string get_graph_key(const std::string &op_name, const std::array<ggml_tens
const std::array<ggml_tensor *, _OutputSize> &outputs) {
constexpr static const auto append_dimensions = [](std::string &key, const ggml_tensor *tensor) {
char buffer[256] = {};
snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld", (long)tensor->ne[0], (long)tensor->ne[1],
(long)tensor->ne[2], (long)tensor->ne[3]);
snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
(long)tensor->ne[2], (long)tensor->ne[3], qnn::get_ggml_type_name(tensor->type));
key += buffer;
};

std::string graph_key(op_name);
for (auto &input : inputs) {
append_dimensions(graph_key, input);
}
for (auto &output : outputs) {
append_dimensions(graph_key, output);
}

graph_key += qnn::get_ggml_type_name(outputs.front()->type);
return graph_key;
}
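The key now encodes each tensor's four dimensions plus its short type name, and the output type is appended once more at the end. A self-contained sketch of the resulting format, assuming qnn::get_ggml_type_name returns short names such as "f32":

```cpp
#include <cstdio>
#include <string>

// Re-creation of the cache-key format for illustration only; the real code
// uses qnn::get_ggml_type_name and the tensors' ne[] arrays.
static std::string key_part(long ne0, long ne1, long ne2, long ne3, const char *type_name) {
    char buffer[256] = {};
    std::snprintf(buffer, sizeof(buffer), "_%ldx%ldx%ldx%ld%s", ne0, ne1, ne2, ne3, type_name);
    return buffer;
}

int main() {
    std::string key = "ADD";
    key += key_part(4, 4, 1, 1, "f32"); // src0
    key += key_part(4, 4, 1, 1, "f32"); // src1
    key += key_part(4, 4, 1, 1, "f32"); // dst
    key += "f32";                       // output type, appended last
    std::printf("%s\n", key.c_str());   // ADD_4x4x1x1f32_4x4x1x1f32_4x4x1x1f32f32
    return 0;
}
```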

qnn::ggml_op_constructor_t generate_common_op_constructor(const std::string &op_name) {
if (op_name == QNN_OP_MAT_MUL) {
// For QNN_OP_MAT_MUL, we need to transpose the input tensor
return [](const std::string &name) {
auto config = std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL);
Qnn_Scalar_t scalar = QNN_SCALAR_INIT;
scalar.dataType = QNN_DATATYPE_BOOL_8;
scalar.bool8Value = true;
config->add_scalar_param(QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar);
QNN_LOG_DEBUG("add scalar param %s\n", QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0);
return config;
};
}

return [op_name](const std::string &name) {
return std::make_unique<qnn::ggml_qnn_op_config>(name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name);
};
}
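This factory now lives in the op-config module as qnn::create_op_constructor (see the call site below and commits 189325f and 222f9a1). A hedged sketch of its plausible shape there; the constructor signatures of ggml_qnn_matmul_op_config and ggml_qnn_single_op_config are assumptions, not the PR's exact code:

```cpp
// Sketch only: names come from the commit list above, signatures are assumed.
namespace qnn {

ggml_op_constructor_t create_op_constructor(const std::string &op_name) {
    if (op_name == QNN_OP_MAT_MUL) {
        // mat_mul is built by a dedicated config that owns the transpose
        // parameter and any intermediate tensors it needs.
        return [](const std::string &instance_name) {
            return std::make_unique<ggml_qnn_matmul_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW);
        };
    }

    // every other supported op maps 1:1 onto a single QNN op
    return [op_name](const std::string &instance_name) {
        return std::make_unique<ggml_qnn_single_op_config>(instance_name, QNN_OP_PACKAGE_NAME_QTI_AISW, op_name);
    };
}

} // namespace qnn
```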

constexpr const char *kGgmlOpToQnnOp[] = {
nullptr, // GGML_OP_NONE
nullptr, // GGML_OP_DUP
@@ -278,7 +257,7 @@ qnn::ggml_qnn_graph *get_qnn_graph_from_cache(ggml_backend_qnn_device_context *c
return nullptr;
}

auto op_constructor = generate_common_op_constructor(kGgmlOpToQnnOp[op]);
auto op_constructor = qnn::create_op_constructor(kGgmlOpToQnnOp[op]);
if (!graph->build_graph(op_constructor, to_ggml_tensor_array<_InputSize>(inputs),
to_ggml_tensor_array<_OutputSize>(outputs))) {
QNN_LOG_ERROR("build_graph failed\n");
@@ -542,11 +521,57 @@ static constexpr const ggml_qnn_binary_op_t kQnnBinaryOpsTable[] = {
static_assert(sizeof(kQnnBinaryOpsTable) / sizeof(kQnnBinaryOpsTable[0]) == GGML_OP_COUNT,
"GGML_OP_COUNT does not match the size of the kQnnBinaryOpsTable table");

bool ggml_qnn_supports_tensor(ggml_backend_qnn_device_context *ctx, const ggml_tensor *tensor) {
switch (tensor->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
if (ctx->supported_types.find(tensor->type) == ctx->supported_types.end()) {
QNN_LOG_DEBUG("unsupported data type GGML_TYPE_F16 for cpu backend");
return false;
}
break;
default:
QNN_LOG_DEBUG("unsupported data type %d", tensor->type);
return false;
}

return true;
}

bool ggml_qnn_supports_matmul_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
GGML_UNUSED(ctx);

auto *src0 = op->src[0];
auto *src1 = op->src[1];
if (src0->type != src1->type || src0->type != op->type) {
// current qnn implementation only supports the same type for src0 and src1
QNN_LOG_DEBUG("src0 type %d and src1 type %d and op type %d are not equal", src0->type, src1->type, op->type);
return false;
}

if (src0->ne[2] != src1->ne[2] || src0->ne[3] != src1->ne[3]) {
/*
* TODO: remove the blocker here when qnn backend supports mul_mat like this:
* [ne03, ne02, n, k] * [ne03 * x, ne02 * y, m, k] -> [ne03 * x, ne02 * y, m, n]
*/
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
return false;
}

return true;
}
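As a worked example, the MUL_MAT_256x16x10x1_256x1x10x1_16x1x10x1 case fixed in commit a6deb22 passes both checks: src0 [256, 16, 10, 1] and src1 [256, 1, 10, 1] share a type and agree in ne[2] and ne[3]. A mixed f16/f32 pair, or operands whose third or fourth dimensions differ, is rejected until broadcast-style mat_mul support lands (see the TODO above).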

} // namespace

namespace qnn {

bool ggml_qnn_supports_op(const ggml_tensor *op) {
bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op) {
if (op->op == GGML_OP_NONE) {
return true;
}

if (op->op == GGML_OP_UNARY) {
if (!kQnnUnaryOpsTable[kGgmlUnaryOpStart + ggml_get_unary_op(op)]) {
QNN_LOG_DEBUG("unsupported unary op %d", ggml_get_unary_op(op));
@@ -557,35 +582,38 @@ bool ggml_qnn_supports_op(const ggml_tensor *op) {
QNN_LOG_DEBUG("src0 is nullptr");
return false;
}
} else if (op->op != GGML_OP_NONE) {
} else {
if (!kQnnUnaryOpsTable[op->op] && !kQnnBinaryOpsTable[op->op]) {
QNN_LOG_DEBUG("unsupported op %d", op->op);
return false;
}

if (!op->src[0] || !op->src[1]) {
auto *src0 = op->src[0];
auto *src1 = op->src[1];
if (!src0 || !src1) {
QNN_LOG_DEBUG("src0 or src1 is nullptr");
return false;
}

#ifndef NDEBUG
if (op->op == GGML_OP_ADD && !is_tensor_dimensions_equal(op->src[0], op->src[1])) {
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
if (!ggml_qnn_supports_tensor(ctx, src0) || !ggml_qnn_supports_tensor(ctx, src1) ||
!ggml_qnn_supports_tensor(ctx, op)) {
return false;
}
#endif
}

switch (op->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_I8:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q4_0:
break;
default:
QNN_LOG_DEBUG("unsupported src0 type %d", op->src[0]->type);
return false;
switch (op->op) {
case GGML_OP_ADD:
if (!is_tensor_dimensions_equal(src0, src1)) {
QNN_LOG_DEBUG("src0 and src1 dimensions are not equal");
return false;
}
break;

case GGML_OP_MUL_MAT:
return ggml_qnn_supports_matmul_op(ctx, op);

default:
return false;
}
}

return true;
2 changes: 1 addition & 1 deletion ggml/src/ggml-qnn/backend-ops.hpp
@@ -6,7 +6,7 @@

namespace qnn {

bool ggml_qnn_supports_op(const ggml_tensor *op);
bool ggml_qnn_supports_op(ggml_backend_qnn_device_context *ctx, const ggml_tensor *op);
bool ggml_qnn_forward(ggml_backend_qnn_device_context *ctx, struct ggml_tensor *tensor);

} // namespace qnn
2 changes: 2 additions & 0 deletions ggml/src/ggml-qnn/backend.hpp
@@ -4,6 +4,7 @@
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "ggml.h"

@@ -26,6 +27,7 @@ struct ggml_backend_qnn_device_context {

// initialize in init
qnn::qcom_socinfo socinfo = {};
std::unordered_set<ggml_type> supported_types;
std::shared_ptr<qnn::qnn_instance> instance;
std::shared_ptr<qnn::qnn_interface> qnn_interface;

4 changes: 2 additions & 2 deletions ggml/src/ggml-qnn/buffer.hpp
@@ -8,8 +8,8 @@
namespace qnn {
class ggml_qnn_rpc_buffer {
public:
ggml_qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, size_t size, uint32_t rank, uint32_t *dimensions,
Qnn_DataType_t data_type) :
ggml_qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
uint32_t *dimensions, Qnn_DataType_t data_type) :
_qnn_instance(qnn_instance), _size(size) {

_qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(void *)));
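The signature change above only adds const to the scalar parameters, so call sites are unaffected. A hypothetical usage sketch, assuming an initialized std::shared_ptr<qnn::qnn_instance> named instance (the size, rank, dimensions, and data type are illustrative):

```cpp
// Hypothetical usage, not from the PR; requires <memory> and the QNN headers.
uint32_t dimensions[] = {4, 4, 1, 1};
auto rpc_buffer = std::make_unique<qnn::ggml_qnn_rpc_buffer>(
    instance,
    /* size */ 4 * 4 * sizeof(float), // element count * element size
    /* rank */ 4,
    dimensions,
    QNN_DATATYPE_FLOAT_32);
```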