diff --git a/ggml/src/ggml-cann/acl_tensor.cpp b/ggml/src/ggml-cann/acl_tensor.cpp
old mode 100755
new mode 100644
index 8ffac31dd661a..8958ebcd78704
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -51,28 +51,31 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
     return ACL_DT_UNDEFINED;
 }
 
-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
-                                   size_t* nb, int64_t dims, aclFormat format,
-                                   size_t offset) {
+aclTensor * ggml_cann_create_tensor(const ggml_tensor * tensor,
+                                    int64_t *           ne,
+                                    size_t *            nb,
+                                    int64_t             dims,
+                                    aclFormat           format,
+                                    size_t              offset) {
     // If tensor is bcasted, up to GGML_MAX_DIMS additional dimensions will be
     // added.
     int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
     if (ne == nullptr) {
         for (int i = 0; i < GGML_MAX_DIMS; i++) {
-            acl_ne[i] = tensor->ne[i];
+            acl_ne[i]     = tensor->ne[i];
             // The step size of acl is in elements.
             acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
         }
     } else {
         // With bcast
         for (int i = 0; i < dims; i++) {
-            acl_ne[i] = ne[i];
+            acl_ne[i]     = ne[i];
             acl_stride[i] = nb[i] / ggml_element_size(tensor);
         }
     }
 
-    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t final_dims      = (dims == 0 ? GGML_MAX_DIMS : dims);
     int64_t acl_storage_len = 1;
     for (int i = 0; i < final_dims; i++) {
         acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
@@ -84,15 +87,13 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
     std::reverse(acl_ne, acl_ne + final_dims);
     std::reverse(acl_stride, acl_stride + final_dims);
 
-    aclTensor* acl_tensor = aclCreateTensor(
-        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
-        elem_offset, format, &acl_storage_len, 1,
-        tensor->data);
+    aclTensor * acl_tensor = aclCreateTensor(acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
+                                             elem_offset, format, &acl_storage_len, 1, tensor->data);
 
     return acl_tensor;
 }
 
-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1) {
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
         if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) {
             return true;
@@ -101,15 +102,16 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
     return false;
 }
 
-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
-                                  const ggml_tensor* src1,
-                                  int64_t* bcast_src0_ne,
-                                  int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
-                                  size_t* bcast_src1_nb) {
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+                                  const ggml_tensor * src1,
+                                  int64_t *           bcast_src0_ne,
+                                  int64_t *           bcast_src1_ne,
+                                  size_t *            bcast_src0_nb,
+                                  size_t *            bcast_src1_nb) {
     GGML_ASSERT(ggml_can_repeat(src1, src0));
     int bcast_dim_cnt = 0;
     for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        int64_t nr = src0->ne[i] / src1->ne[i];
+        int64_t nr                   = src0->ne[i] / src1->ne[i];
         bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr;
         bcast_src1_ne[bcast_dim_cnt] = src1->ne[i];
         bcast_src0_nb[bcast_dim_cnt] = src0->nb[i];
@@ -119,21 +121,26 @@ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
             // Need to add an extra dim.
             bcast_src0_ne[bcast_dim_cnt] = nr;
             bcast_src1_ne[bcast_dim_cnt] = 1;
-            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] *
-                                           bcast_src0_ne[bcast_dim_cnt - 1];
-            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] *
-                                           bcast_src1_ne[bcast_dim_cnt - 1];
+            bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] * bcast_src0_ne[bcast_dim_cnt - 1];
+            bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] * bcast_src1_ne[bcast_dim_cnt - 1];
             bcast_dim_cnt++;
         }
     }
     return bcast_dim_cnt;
 }
 
-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+                                         const int64_t * weight_ne,
+                                         const int64_t * dst_ne,
+                                         const size_t *  input_nb,
+                                         const size_t *  weight_nb,
+                                         const size_t *  dst_nb,
+                                         int64_t *       bcast_input_ne,
+                                         int64_t *       bcast_weight_ne,
+                                         int64_t *       bcast_dst_ne,
+                                         size_t *        bcast_input_nb,
+                                         size_t *        bcast_weight_nb,
+                                         size_t *        bcast_dst_nb) {
     // input and dst should be in the same shape, except the first two dims.
     GGML_ASSERT(input_ne[2] == dst_ne[2]);
     GGML_ASSERT(input_ne[3] == dst_ne[3]);
@@ -148,34 +155,30 @@ int64_t ggml_cann_get_mulmat_bcast_shape(
         // Do not use bcast in the first two dimensions because we only support
         // the bcast batch dimension. Just copy them.
         if (i < 2 || nr == 1) {
-            bcast_input_ne[bcast_dim_cnt] = input_ne[i];
+            bcast_input_ne[bcast_dim_cnt]  = input_ne[i];
             bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i];
+            bcast_dst_ne[bcast_dim_cnt]    = dst_ne[i];
 
-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
+            bcast_input_nb[bcast_dim_cnt]  = input_nb[i];
             bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_dst_nb[bcast_dim_cnt]    = dst_nb[i];
             bcast_dim_cnt++;
         } else {
             // Need to add an extra dim.
-            bcast_input_ne[bcast_dim_cnt] = nr;
-            bcast_dst_ne[bcast_dim_cnt] = nr;
+            bcast_input_ne[bcast_dim_cnt]  = nr;
+            bcast_dst_ne[bcast_dim_cnt]    = nr;
             bcast_weight_ne[bcast_dim_cnt] = 1;
-            bcast_input_nb[bcast_dim_cnt] = input_nb[i];
-            bcast_dst_nb[bcast_dim_cnt] = dst_nb[i];
+            bcast_input_nb[bcast_dim_cnt]  = input_nb[i];
+            bcast_dst_nb[bcast_dim_cnt]    = dst_nb[i];
             bcast_weight_nb[bcast_dim_cnt] = weight_nb[i];
             bcast_dim_cnt++;
 
-            bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr;
-            bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr;
+            bcast_input_ne[bcast_dim_cnt]  = input_ne[i] / nr;
+            bcast_dst_ne[bcast_dim_cnt]    = dst_ne[i] / nr;
             bcast_weight_ne[bcast_dim_cnt] = weight_ne[i];
-            bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] *
-                                            bcast_input_ne[bcast_dim_cnt - 1];
-            bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] *
-                                          bcast_dst_ne[bcast_dim_cnt - 1];
-            bcast_weight_nb[bcast_dim_cnt] =
-                bcast_weight_nb[bcast_dim_cnt - 1] *
-                bcast_weight_ne[bcast_dim_cnt - 1];
+            bcast_input_nb[bcast_dim_cnt]  = bcast_input_nb[bcast_dim_cnt - 1] * bcast_input_ne[bcast_dim_cnt - 1];
+            bcast_dst_nb[bcast_dim_cnt]    = bcast_dst_nb[bcast_dim_cnt - 1] * bcast_dst_ne[bcast_dim_cnt - 1];
+            bcast_weight_nb[bcast_dim_cnt] = bcast_weight_nb[bcast_dim_cnt - 1] * bcast_weight_ne[bcast_dim_cnt - 1];
            bcast_dim_cnt++;
        }
    }
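For reference, the broadcast expansion reformatted above is easiest to see on concrete shapes. The following standalone sketch is illustrative only: it mirrors the loop of ggml_cann_get_bcast_shape on plain arrays (the shapes and names are made up, and strides are omitted), splitting each broadcast dimension into a shared part plus an extra "repeats" dimension:

    #include <cstdint>
    #include <cstdio>

    // Minimal re-implementation of the bcast-shape expansion, for illustration.
    // ne = dimension sizes, lowest dimension first, as in ggml.
    static int bcast_shape(const int64_t * ne0, const int64_t * ne1, int dims,
                           int64_t * out0, int64_t * out1) {
        int cnt = 0;
        for (int i = 0; i < dims; i++) {
            int64_t nr = ne0[i] / ne1[i];  // repeat factor in this dim
            out0[cnt]  = ne0[i] / nr;      // shared part
            out1[cnt]  = ne1[i];
            cnt++;
            if (nr != 1) {                 // extra dim holds the repeats
                out0[cnt] = nr;
                out1[cnt] = 1;
                cnt++;
            }
        }
        return cnt;
    }

    int main() {
        const int64_t ne0[4] = { 32, 8, 6, 1 };  // src0
        const int64_t ne1[4] = { 32, 2, 6, 1 };  // src1 repeats 4x along dim 1
        int64_t       out0[8], out1[8];
        int           n = bcast_shape(ne0, ne1, 4, out0, out1);
        for (int i = 0; i < n; i++) {
            printf("dim %d: src0=%lld src1=%lld\n", i, (long long) out0[i], (long long) out1[i]);
        }
        // Expected: {32,32} {2,2} {4,1} {6,6} {1,1} -> rank grows from 4 to 5.
    }
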
diff --git a/ggml/src/ggml-cann/acl_tensor.h b/ggml/src/ggml-cann/acl_tensor.h
old mode 100755
new mode 100644
index 93f09937efb31..cb17ebcc1bbe2
--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@@ -62,10 +62,12 @@ aclDataType ggml_cann_type_mapping(ggml_type type);
  * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
  * @return Pointer to the created ACL tensor.
  */
-aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
-                                   size_t* nb = nullptr, int64_t dims = 0,
-                                   aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0);
+aclTensor * ggml_cann_create_tensor(const ggml_tensor * tensor,
+                                    int64_t *           ne = nullptr,
+                                    size_t *            nb = nullptr,
+                                    int64_t             dims = 0,
+                                    aclFormat           format = ACL_FORMAT_ND,
+                                    size_t              offset = 0);
 
 /**
  * @brief Template for creating an ACL tensor from provided parameters. typename TYPE
@@ -87,12 +89,15 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
  * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
  * @return Pointer to the created ACL tensor.
 */
-template<typename TYPE>
-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   TYPE type_size, int64_t* ne, TYPE* nb,
-                                   int64_t dims,
-                                   aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0) {
+template <typename TYPE>
+aclTensor * ggml_cann_create_tensor(void *      data_ptr,
+                                    aclDataType dtype,
+                                    TYPE        type_size,
+                                    int64_t *   ne,
+                                    TYPE *      nb,
+                                    int64_t     dims,
+                                    aclFormat   format = ACL_FORMAT_ND,
+                                    size_t      offset = 0) {
     int64_t tmp_ne[GGML_MAX_DIMS * 2];
     int64_t tmp_stride[GGML_MAX_DIMS * 2];
 
@@ -109,9 +114,8 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
     std::reverse(tmp_ne, tmp_ne + dims);
     std::reverse(tmp_stride, tmp_stride + dims);
 
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
+    aclTensor * acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, format, &acl_storage_len, 1, data_ptr);
 
     return acl_tensor;
 }
@@ -132,7 +136,7 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
  * to 1. If such a dimension is found, broadcasting is required to align t1
  * with t0 for element-wise operations.
 */
-bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
+bool ggml_cann_need_bcast(const ggml_tensor * t0, const ggml_tensor * t1);
 
 /**
  * @brief Computes broadcast shapes and strides for two ggml_tensors.
@@ -187,19 +191,21 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
  * dim1 in an inserted dim, should add nb for dim1,
  * and all other nb moves to next in order.
 */
-int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
-                                  int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
-                                  size_t* bcast_nb_src0, size_t* bcast_nb_src1);
+int64_t ggml_cann_get_bcast_shape(const ggml_tensor * src0,
+                                  const ggml_tensor * src1,
+                                  int64_t *           bcast_ne_src0,
+                                  int64_t *           bcast_ne_src1,
+                                  size_t *            bcast_nb_src0,
+                                  size_t *            bcast_nb_src1);
 
 // Bcast macro to avoid duplicate code.
-#define BCAST_SHAPE(src0, src1)                                              \
-    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                            \
-    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                            \
-    size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2];                             \
-    size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2];                             \
-    int64_t bcast_dims = ggml_cann_get_bcast_shape(                          \
-        src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
-        bcast_##src1##_nb);
+#define BCAST_SHAPE(src0, src1)                                                                      \
+    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                                                    \
+    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                                                    \
+    size_t  bcast_##src0##_nb[GGML_MAX_DIMS * 2];                                                    \
+    size_t  bcast_##src1##_nb[GGML_MAX_DIMS * 2];                                                    \
+    int64_t bcast_dims = ggml_cann_get_bcast_shape(src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, \
+                                                   bcast_##src0##_nb, bcast_##src1##_nb);
 
 #define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
 
@@ -233,26 +239,31 @@ int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* sr
  * before cast dim.
  * @sa ggml_cann_get_bcast_shape
 */
-int64_t ggml_cann_get_mulmat_bcast_shape(
-    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
-    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
-    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
-    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
+int64_t ggml_cann_get_mulmat_bcast_shape(const int64_t * input_ne,
+                                         const int64_t * weight_ne,
+                                         const int64_t * dst_ne,
+                                         const size_t *  input_nb,
+                                         const size_t *  weight_nb,
+                                         const size_t *  dst_nb,
+                                         int64_t *       bcast_input_ne,
+                                         int64_t *       bcast_weight_ne,
+                                         int64_t *       bcast_dst_ne,
+                                         size_t *        bcast_input_nb,
+                                         size_t *        bcast_weight_nb,
+                                         size_t *        bcast_dst_nb);
 
 // Bcast macro to avoid duplicate code.
-#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                         \
-    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                      \
-    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                     \
-    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                        \
-    size_t bcast_##input##_nb[GGML_MAX_DIMS * 2];                       \
-    size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2];                      \
-    size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2];                         \
-    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(              \
-        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
-        bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne,      \
-        bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
+#define BCAST_MUL_MAT_SHAPE(input, weight, dst)                                                            \
+    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                                                         \
+    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                                                        \
+    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                                                           \
+    size_t  bcast_##input##_nb[GGML_MAX_DIMS * 2];                                                         \
+    size_t  bcast_##weight##_nb[GGML_MAX_DIMS * 2];                                                        \
+    size_t  bcast_##dst##_nb[GGML_MAX_DIMS * 2];                                                           \
+    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(                                                 \
+        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, bcast_##input##_ne,                \
+        bcast_##weight##_ne, bcast_##dst##_ne, bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
 
-#define BCAST_MUL_MAT_PARAM(tensor) \
-    bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
+#define BCAST_MUL_MAT_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
 
 #endif  // CANN_ACL_TENSOR_H
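The BCAST_SHAPE/BCAST_PARAM pair reformatted above works by token pasting: invoking BCAST_SHAPE(src0, src1) declares bcast_src0_ne, bcast_src0_nb, bcast_src1_ne, bcast_src1_nb and bcast_dims in the enclosing scope, and BCAST_PARAM(t) then names the matching (ne, nb, dims) triple at a call site such as ggml_cann_create_tensor. A reduced, self-contained analogue of the mechanism (the DECL/PARAM names here are made up for illustration and are not part of the backend):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Same token-pasting pattern as BCAST_SHAPE/BCAST_PARAM, reduced to
    // something that compiles standalone.
    #define DECL(t)  int64_t t##_ne = 0; size_t t##_nb = 0;
    #define PARAM(t) t##_ne, t##_nb

    static void use(int64_t ne, size_t nb) {
        printf("ne=%lld nb=%zu\n", (long long) ne, nb);
    }

    int main() {
        DECL(src0)         // declares src0_ne and src0_nb in this scope
        use(PARAM(src0));  // expands to use(src0_ne, src0_nb)
    }

Because the macros declare variables named after their arguments, they must be passed plain identifiers (as bcast_shape in aclnn_ops.cpp does with src0/src1) and invoked at most once per scope.
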
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
old mode 100755
new mode 100644
index 2857e080b4c16..f030ea0136a95
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -86,9 +86,12 @@
 
 #include "../ggml-common.h"
 
-
-void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, aclTensor ** acl_src0,
-                 aclTensor ** acl_src1, aclTensor ** acl_dst) {
+void bcast_shape(ggml_tensor * src0,
+                 ggml_tensor * src1,
+                 ggml_tensor * dst,
+                 aclTensor ** acl_src0,
+                 aclTensor ** acl_src1,
+                 aclTensor ** acl_dst) {
     GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
     // Need bcast
     if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
@@ -103,40 +106,40 @@
     }
 }
 
-void ggml_cann_op_unary(
-    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
-    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+                        ggml_backend_cann_context &                                                ctx,
+                        ggml_tensor *                                                              dst) {
+    ggml_tensor * src = dst->src[0];
 
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
     unary_op(ctx, acl_src, acl_dst);
     ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }
 
-void ggml_cann_op_unary_gated(
-    std::function<void(ggml_backend_cann_context&, aclTensor*, aclTensor*)> unary_op,
-    ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
+void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
+                              ggml_backend_cann_context &                                                ctx,
+                              ggml_tensor *                                                              dst) {
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];
 
     GGML_ASSERT(ggml_is_contiguous_1(src0));
     GGML_ASSERT(ggml_is_contiguous_1(dst));
     const int32_t swapped = ggml_get_op_params_i32(dst, 1);
 
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    aclTensor *acl_src0 = nullptr, *acl_src1 = nullptr;
-    if(src1) {
+    aclTensor * acl_dst  = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src0 = nullptr, *acl_src1 = nullptr;
+    if (src1) {
         GGML_ASSERT(ggml_is_contiguous_1(src1));
         GGML_ASSERT(src0->type == src1->type);
         acl_src0 = ggml_cann_create_tensor(src0);
         acl_src1 = ggml_cann_create_tensor(src1);
     } else {
-        int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
-        size_t nb[] = {src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]};
-        acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
+        int64_t ne[] = { src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3] };
+        size_t  nb[] = { src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3] };
+        acl_src0     = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
         acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
         if (swapped) {
             std::swap(acl_src0, acl_src1);
@@ -159,10 +162,12 @@
  * @param repeat_array The array specifying the number of repetitions along each
  * dimension.
 */
-static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-                         aclTensor* acl_dst, int64_t* repeat_array) {
+static void aclnn_repeat(ggml_backend_cann_context & ctx,
+                         aclTensor *                 acl_src,
+                         aclTensor *                 acl_dst,
+                         int64_t *                   repeat_array) {
     // repeat tensor along each dim with repeat_array
-    aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
+    aclIntArray * repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS);
 
     GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats, acl_dst);
     ggml_cann_release_resources(ctx, repeats);
@@ -181,61 +186,63 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  * @param cast_data_type The target data type to which the source tensor will be
  * casted.
 */
-static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-                       aclTensor* acl_dst, aclDataType cast_data_type) {
+static void aclnn_cast(ggml_backend_cann_context & ctx,
+                       aclTensor *                 acl_src,
+                       aclTensor *                 acl_dst,
+                       aclDataType                 cast_data_type) {
     GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
 }
 
-void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
     GGML_ASSERT(ggml_can_repeat(src, dst));
 
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
-    int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
-                              dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]};
+    int64_t repeatsArray[] = { dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], dst->ne[1] / src->ne[1],
+                               dst->ne[0] / src->ne[0] };
 
     aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
     ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }
 
-void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
-               aclTensor* acl_src1, aclTensor* acl_dst) {
-    float alphaValue = 1.0f;
-    aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
-    if (acl_dst != nullptr)
+void aclnn_add(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
+    float       alphaValue = 1.0f;
+    aclScalar * alpha      = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+    if (acl_dst != nullptr) {
         GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
-    else
+    } else {
         GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha);
+    }
     ggml_cann_release_resources(ctx, alpha);
 }
 
-void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
-               aclTensor* acl_src1, aclTensor* acl_dst) {
-    float alphaValue = 1.0f;
-    aclScalar* alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
-    if (acl_dst != nullptr)
+void aclnn_sub(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
+    float       alphaValue = 1.0f;
+    aclScalar * alpha      = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+    if (acl_dst != nullptr) {
         GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha, acl_dst);
-    else
+    } else {
         GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha);
+    }
     ggml_cann_release_resources(ctx, alpha);
 }
 
-void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-               aclTensor* acl_other, aclTensor* acl_dst) {
-    if (acl_dst != nullptr)
+void aclnn_mul(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
+    if (acl_dst != nullptr) {
         GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
-    else
+    } else {
         GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
+    }
 }
 
-void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-               aclTensor* acl_other, aclTensor* acl_dst) {
-    if (acl_dst != nullptr)
+void aclnn_div(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
+    if (acl_dst != nullptr) {
         GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
-    else
+    } else {
         GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
+    }
 }
 
@@ -260,9 +267,12 @@ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
  * @param inplace Flag indicating whether to perform the operation in-place on
  * `acl_src`.
 */
-static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-                       float scale, aclTensor* acl_dst, bool inplace) {
-    aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
+static void aclnn_muls(ggml_backend_cann_context & ctx,
+                       aclTensor *                 acl_src,
+                       float                       scale,
+                       aclTensor *                 acl_dst,
+                       bool                        inplace) {
+    aclScalar * acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
     if (inplace) {
         GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale);
     } else {
@@ -271,19 +281,18 @@ static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src,
     ggml_cann_release_resources(ctx, acl_scale);
 }
 
-void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
 
     GGML_ASSERT(src->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
     float negative_slope;
     memcpy(&negative_slope, dst->op_params, sizeof(float));
-    aclScalar* acl_negative_slope =
-        aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
+    aclScalar * acl_negative_slope = aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
 
     GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_negative_slope, acl_dst);
     ggml_cann_release_resources(ctx, acl_negative_slope, acl_src, acl_dst);
@@ -299,26 +308,27 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  * stored.
  * @param concat_dim The dimension along which the tensors will be concatenated.
 */
-static void aclnn_concat(ggml_backend_cann_context& ctx,
-                         aclTensorList* tensorList, aclTensor* acl_dst,
-                         int64_t concat_dim) {
+static void aclnn_concat(ggml_backend_cann_context & ctx,
+                         aclTensorList *             tensorList,
+                         aclTensor *                 acl_dst,
+                         int64_t                     concat_dim) {
     GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
 }
 
-void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
-    aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
-    aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0     = dst->src[0];
+    ggml_tensor * src1     = dst->src[1];
+    aclTensor *   acl_src0 = ggml_cann_create_tensor(src0);
+    aclTensor *   acl_src1 = ggml_cann_create_tensor(src1);
+    aclTensor *   acl_dst  = ggml_cann_create_tensor(dst);
 
     const int32_t dim = ggml_get_op_params_i32(dst, 0);
 
     GGML_ASSERT(dim >= 0 && dim < 4);
     int32_t acl_dim = 3 - dim;
 
-    aclTensor* tensors[] = {acl_src0, acl_src1};
-    aclTensorList* tensor_list = aclCreateTensorList(tensors, 2);
+    aclTensor *     tensors[]   = { acl_src0, acl_src1 };
+    aclTensorList * tensor_list = aclCreateTensorList(tensors, 2);
     aclnn_concat(ctx, tensor_list, acl_dst, acl_dim);
 
     ggml_cann_release_resources(ctx, tensor_list, acl_dst);
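A note on the dim arithmetic in ggml_cann_concat above: ggml indexes dimensions lowest-first, while the ACL tensor built by ggml_cann_create_tensor reverses that order (the std::reverse calls in acl_tensor.cpp), so a ggml dim d on a 4-D tensor maps to ACL dim 3 - d. A minimal standalone illustration:

    #include <cstdint>
    #include <cstdio>

    // ggml dim -> ACL dim mapping for a 4-D tensor, as used by
    // `int32_t acl_dim = 3 - dim;` in ggml_cann_concat.
    int main() {
        for (int32_t dim = 0; dim < 4; dim++) {
            printf("ggml dim %d -> acl dim %d\n", dim, 3 - dim);
        }
    }
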
@@ -341,162 +351,157 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  * @param step The step size between consecutive values.
  * @param n_elements The number of elements in the destination tensor.
 */
-static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst,
-                         float start, float stop, float step,
-                         int64_t n_elements) {
-    int64_t steps = (int64_t)std::ceil((stop - start) / step);
+static void aclnn_arange(ggml_backend_cann_context & ctx,
+                         aclTensor *                 acl_dst,
+                         float                       start,
+                         float                       stop,
+                         float                       step,
+                         int64_t                     n_elements) {
+    int64_t steps = (int64_t) std::ceil((stop - start) / step);
     GGML_ASSERT(n_elements == steps);
 
-    aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
-    aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
-    aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
+    aclScalar * acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT);
+    aclScalar * acl_end   = aclCreateScalar(&stop, aclDataType::ACL_FLOAT);
+    aclScalar * acl_step  = aclCreateScalar(&step, aclDataType::ACL_FLOAT);
 
     GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start, acl_end, acl_step, acl_dst);
     ggml_cann_release_resources(ctx, acl_start, acl_end, acl_step);
 }
 
-void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
     int64_t n_elements = ggml_nelements(dst);
-    float start;
-    float stop;
-    float step;
-    memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
-    memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
-    memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
+    float   start;
+    float   stop;
+    float   step;
+    memcpy(&start, (float *) dst->op_params + 0, sizeof(float));
+    memcpy(&stop, (float *) dst->op_params + 1, sizeof(float));
+    memcpy(&step, (float *) dst->op_params + 2, sizeof(float));
 
     aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
     ggml_cann_release_resources(ctx, acl_dst);
 }
 
-void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
 
     float min;
     float max;
     memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
-    aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
-    aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
+    aclScalar * acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT);
+    aclScalar * acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT);
 
     GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src, acl_min, acl_max, acl_dst);
     ggml_cann_release_resources(ctx, acl_min, acl_max, acl_src, acl_dst);
 }
 
-void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
 
     // scale factor
     float v;
     memcpy(&v, dst->op_params, sizeof(float));
 
-    aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclScalar * scale   = aclCreateScalar(&v, aclDataType::ACL_FLOAT);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
     GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, scale, acl_dst);
     ggml_cann_release_resources(ctx, scale, acl_src, acl_dst);
 }
 
-void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
-    enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
-
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    ggml_cann_pool_alloc temp_buffer_allocator(
-        ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
-    void* buffer = temp_buffer_allocator.get();
-    aclTensor* tmp_tensor =
-        ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type),
-                                dst->ne, dst->nb, GGML_MAX_DIMS);
-    GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false),
-                            tmp_tensor);
+void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor *        src   = dst->src[0];
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    aclTensor *          acl_src = ggml_cann_create_tensor(src);
+    aclTensor *          acl_dst = ggml_cann_create_tensor(dst);
+    ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
+    void *               buffer = temp_buffer_allocator.get();
+    aclTensor *          tmp_tensor =
        ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, dst->nb, GGML_MAX_DIMS);
+    GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor);
     GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor, ggml_cann_type_mapping(dst->type), acl_dst);
     ggml_cann_release_resources(ctx, acl_src, tmp_tensor, acl_dst);
 }
 
-void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
 
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    std::vector<int64_t> normData = {dst->ne[0]};
-    aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size());
-    GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr,
-                            eps, acl_dst, nullptr, nullptr);
+    std::vector<int64_t> normData = { dst->ne[0] };
+    aclIntArray *        norm     = aclCreateIntArray(normData.data(), normData.size());
+    GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr, eps, acl_dst, nullptr, nullptr);
 
     ggml_cann_release_resources(ctx, norm, acl_src, acl_dst);
 }
 
-void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src = dst->src[0];
 
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclTensor * acl_src = ggml_cann_create_tensor(src);
+    aclTensor * acl_dst = ggml_cann_create_tensor(dst);
 
     int n_groups = dst->op_params[0];
 
     float eps;
     memcpy(&eps, dst->op_params + 1, sizeof(float));
 
-    int64_t N = src->ne[3];
-    int64_t C = src->ne[2];
+    int64_t N   = src->ne[3];
+    int64_t C   = src->ne[2];
     int64_t HxW = src->ne[1] * src->ne[0];
 
-    size_t type_size = ggml_type_size(src->type);
-    int64_t ne[] = {n_groups, N};
-    size_t nb[] = {type_size, type_size * n_groups};
-    size_t n_bytes = N * n_groups;
+    size_t  type_size = ggml_type_size(src->type);
+    int64_t ne[]      = { n_groups, N };
+    size_t  nb[]      = { type_size, type_size * n_groups };
+    size_t  n_bytes   = N * n_groups;
 
     ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
-    void* buffer = temp_buffer_allocator.get();
-    aclTensor* acl_mean_out = ggml_cann_create_tensor(
-        buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
-    aclTensor* acl_rstd_out = ggml_cann_create_tensor(
-        (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
-
-    GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps,
-                            acl_dst, acl_mean_out, acl_rstd_out);
+    void *               buffer       = temp_buffer_allocator.get();
+    aclTensor *          acl_mean_out = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+    aclTensor *          acl_rstd_out =
        ggml_cann_create_tensor((char *) buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
+
+    GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst, acl_mean_out,
+                            acl_rstd_out);
 
     ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_mean_out, acl_rstd_out);
 }
 
-void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
+void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src0 = dst->src[0];
+    ggml_tensor * src1 = dst->src[1];
 
-    size_t nb1 = ((int32_t*)dst->op_params)[0];
-    size_t nb2 = ((int32_t*)dst->op_params)[1];
-    size_t nb3 = ((int32_t*)dst->op_params)[2];
-    size_t offset = ((int32_t*)dst->op_params)[3];
-    bool inplace = (bool)((int32_t*)dst->op_params)[4];
+    size_t nb1     = ((int32_t *) dst->op_params)[0];
+    size_t nb2     = ((int32_t *) dst->op_params)[1];
+    size_t nb3     = ((int32_t *) dst->op_params)[2];
+    size_t offset  = ((int32_t *) dst->op_params)[3];
+    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];
 
-    size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
+    size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 };
 
-    aclTensor* acl_dst = ggml_cann_create_tensor(
-        dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
-    aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
+    aclTensor * acl_dst  = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+    aclTensor * acl_src1 = ggml_cann_create_tensor(src1);
 
-    aclScalar* alpha = nullptr;
-    float alphaValue = 1.0f;
-    alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
+    aclScalar * alpha      = nullptr;
+    float       alphaValue = 1.0f;
+    alpha                  = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
 
     if (!inplace) {
         size_t cpy_size = ggml_nbytes(dst);
-        ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size,
-                               ACL_MEMCPY_DEVICE_TO_DEVICE);
-        aclTensor* acl_src0 = ggml_cann_create_tensor(
-            src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
+        ggml_cann_async_memcpy(ctx, dst->data, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE);
+        aclTensor * acl_src0 = ggml_cann_create_tensor(src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
 
         GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
         ggml_cann_release_resources(ctx, acl_src0);
@@ -516,39 +521,34 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
  * @param dim An array of dimension indices.
  * @param dim_size The number of dimensions.
 */
-static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
-                             int64_t* dim, size_t dim_size) {
+static void aclnn_reduce_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t * dim, size_t dim_size) {
     GGML_ASSERT(dst->ne[0] == 1);
-    ggml_tensor* src = dst->src[0];
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-    aclIntArray* reduce_dims = aclCreateIntArray(dim, dim_size);
+    ggml_tensor * src         = dst->src[0];
+    aclTensor *   acl_src     = ggml_cann_create_tensor(src);
+    aclTensor *   acl_dst     = ggml_cann_create_tensor(dst);
+    aclIntArray * reduce_dims = aclCreateIntArray(dim, dim_size);
 
-    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true,
-                            ggml_cann_type_mapping(dst->type), acl_dst);
+    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true, ggml_cann_type_mapping(dst->type), acl_dst);
     ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
 }
 
-void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    int64_t reduce_dims[] = {3};
+void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    int64_t reduce_dims[] = { 3 };
     aclnn_reduce_sum(ctx, dst, reduce_dims, 1);
 }
 
-void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    int64_t reduce_dims[] = {0, 1, 2, 3};
+void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    int64_t reduce_dims[] = { 0, 1, 2, 3 };
     aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
 }
 
-void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
-                                  ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
-    aclTensor* acl_src =
-        ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
-    aclTensor* acl_dst =
-        ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src     = dst->src[0];
+    aclTensor *   acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+    aclTensor *   acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
 
-    std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
-    auto output_size_array = aclCreateIntArray(output_size.data(), 2);
+    std::vector<int64_t> output_size{ dst->ne[1], dst->ne[0] };
+    auto                 output_size_array = aclCreateIntArray(output_size.data(), 2);
 
     GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src, output_size_array, acl_dst);
     ggml_cann_release_resources(ctx, acl_src, acl_dst, output_size_array);
@@ -568,20 +568,22 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
  * The size of the array should be twice the number of dimensions of the tensor.
  * @param value The value to be used for padding. The default value is 0.0.
 */
-static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
-                      aclTensor* acl_dst, int64_t* paddings,
-                      float value = 0.0f) {
-    aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
-    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+static void aclnn_pad(ggml_backend_cann_context & ctx,
+                      aclTensor *                 acl_src,
+                      aclTensor *                 acl_dst,
+                      int64_t *                   paddings,
+                      float                       value = 0.0f) {
+    aclIntArray * acl_pad   = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
+    aclScalar *   acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
 
     GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad, acl_value, acl_dst);
     ggml_cann_release_resources(ctx, acl_pad, acl_value);
 }
 
-void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
-    aclTensor* acl_src = ggml_cann_create_tensor(src);
-    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
+    ggml_tensor * src     = dst->src[0];
+    aclTensor *   acl_src = ggml_cann_create_tensor(src);
+    aclTensor *   acl_dst = ggml_cann_create_tensor(dst);
 
     // padding: value in the array means how much distance will be padding.
     // the position of elements in the array means which direction to padding,
@@ -596,7 +598,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
     const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
 
-    int64_t paddings[] = {lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3};
+    int64_t paddings[] = { lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3 };
     aclnn_pad(ctx, acl_src, acl_dst, paddings);
     ggml_cann_release_resources(ctx, acl_src, acl_dst);
 }
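The paddings array built in ggml_cann_pad above is laid out as {left, right} pairs per dimension, lowest dimension first. A small standalone illustration (the pad amounts are hypothetical):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical request: pad 1 element on the left of dim 0 and
        // 2 elements on the right of dim 1; all other sides stay unpadded.
        const int64_t paddings[8] = { 1, 0, 0, 2, 0, 0, 0, 0 };  // {lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3}
        for (int d = 0; d < 4; d++) {
            printf("dim %d: left %lld, right %lld\n", d,
                   (long long) paddings[2 * d], (long long) paddings[2 * d + 1]);
        }
    }
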
*/ -static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { - ggml_tensor* src = dst->src[0]; +static void ggml_cann_avg_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src = dst->src[0]; GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - aclTensor* acl_src = - ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); - aclTensor* acl_dst = - ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); - - const int32_t* opts = (const int32_t*)dst->op_params; - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - - std::vector kernel_dims = {k1, k0}; - std::vector stride_dims = {s1, s0}; - std::vector padding_avg_dims = {p1, p0}; // (padH, padW) - - auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); - auto* strides = aclCreateIntArray(stride_dims.data(), 2); - auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2); - - bool ceil_mode = false; - bool count_include_pad = true; - int64_t divisor_override = 0; - int8_t cube_math_type = 0; + aclTensor * acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor * acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + + const int32_t * opts = (const int32_t *) dst->op_params; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + std::vector kernel_dims = { k1, k0 }; + std::vector stride_dims = { s1, s0 }; + std::vector padding_avg_dims = { p1, p0 }; // (padH, padW) + + auto * kernel_size = aclCreateIntArray(kernel_dims.data(), 2); + auto * strides = aclCreateIntArray(stride_dims.data(), 2); + auto * paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2); + + bool ceil_mode = false; + bool count_include_pad = true; + int64_t divisor_override = 0; + int8_t cube_math_type = 0; #ifdef ASCEND_310P cube_math_type = 1; #endif - GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg, - ceil_mode, count_include_pad, divisor_override, - cube_math_type, acl_dst); - ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides, - paddings_avg); + GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg, ceil_mode, count_include_pad, + divisor_override, cube_math_type, acl_dst); + ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides, paddings_avg); } /** @@ -667,68 +664,61 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, * @param dst The destination tensor where the result will be stored. The source * tensor is referenced by `dst->src[0]`. 
*/ -static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { - ggml_tensor* src = dst->src[0]; +static void ggml_cann_max_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src = dst->src[0]; GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - aclTensor* acl_src = - ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); - aclTensor* acl_dst = - ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor * acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW); + aclTensor * acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW); - const int32_t* opts = (const int32_t*)dst->op_params; - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; + const int32_t * opts = (const int32_t *) dst->op_params; + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; - int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], - src->ne[3]}; - size_t temp_nb[GGML_MAX_DIMS]; + int64_t temp_ne[] = { src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], src->ne[3] }; + size_t temp_nb[GGML_MAX_DIMS]; temp_nb[0] = ggml_element_size(src); for (int i = 1; i < GGML_MAX_DIMS; i++) { temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1]; } - ggml_cann_pool_alloc temp_buffer_allocator( - ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); - void* buffer = temp_buffer_allocator.get(); - aclTensor* tmp_tensor = ggml_cann_create_tensor( - buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb, - GGML_MAX_DIMS, ACL_FORMAT_NCHW); + ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]); + void * buffer = temp_buffer_allocator.get(); + aclTensor * tmp_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb, + GGML_MAX_DIMS, ACL_FORMAT_NCHW); // pad: see padding in ggml_cann_pad() - int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0}; - float value = -FLT_MAX; + int64_t paddings[] = { p0, p0, p1, p1, 0, 0, 0, 0 }; + float value = -FLT_MAX; aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value); // max_pool - std::vector kernel_dims = {k1, k0}; - std::vector stride_dims = {s1, s0}; + std::vector kernel_dims = { k1, k0 }; + std::vector stride_dims = { s1, s0 }; // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end] - std::vector padding_max_dims = {0, 0, 0, 0}; - std::vector dilation_size = {1, 1}; - auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2); - auto* strides = aclCreateIntArray(stride_dims.data(), 2); - auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4); - auto* dilations = aclCreateIntArray(dilation_size.data(), 2); - - bool ceil_mode = false; + std::vector padding_max_dims = { 0, 0, 0, 0 }; + std::vector dilation_size = { 1, 1 }; + auto * kernel_size = aclCreateIntArray(kernel_dims.data(), 2); + auto * strides = aclCreateIntArray(stride_dims.data(), 2); + auto * paddings_max = aclCreateIntArray(padding_max_dims.data(), 4); + auto * dilations = aclCreateIntArray(dilation_size.data(), 2); + + bool ceil_mode = false; int64_t auto_pads = 0; - GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads, - paddings_max, dilations, ceil_mode, acl_dst); - ggml_cann_release_resources(ctx, acl_src, 
acl_dst, tmp_tensor, kernel_size, - strides, paddings_max, dilations); + GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations, + ceil_mode, acl_dst); + ggml_cann_release_resources(ctx, acl_src, acl_dst, tmp_tensor, kernel_size, strides, paddings_max, dilations); } -void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - const int32_t* opts = (const int32_t*)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); +void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + const int32_t * opts = (const int32_t *) dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); switch (op) { case GGML_OP_POOL_AVG: ggml_cann_avg_pool2d(ctx, dst); @@ -752,17 +742,16 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) { * @param acl_src The source tensor from which data will be copied. * @param acl_dst The destination tensor where the data will be copied to. */ -static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { +static void cann_copy(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src); } -void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; +void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; if (ggml_are_same_shape(src0, dst)) { - aclTensor* acl_src = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_src = ggml_cann_create_tensor(src0); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); if (dst->type == src0->type) { cann_copy(ctx, acl_src, acl_dst); } else { @@ -770,22 +759,20 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } ggml_cann_release_resources(ctx, acl_src, acl_dst); } else { - void* src_trans_buffer = src0->data; + void * src_trans_buffer = src0->data; ggml_cann_pool_alloc src_buffer_allocator; if (!ggml_is_contiguous(src0)) { - aclTensor* acl_src = ggml_cann_create_tensor(src0); - src_buffer_allocator.alloc(ctx.pool(), - ggml_nelements(src0) * ggml_type_size(src0->type)); + aclTensor * acl_src = ggml_cann_create_tensor(src0); + src_buffer_allocator.alloc(ctx.pool(), ggml_nelements(src0) * ggml_type_size(src0->type)); src_trans_buffer = src_buffer_allocator.get(); size_t src_trans_nb[GGML_MAX_DIMS]; src_trans_nb[0] = ggml_type_size(src0->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; } - aclTensor* src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), src0->ne, src_trans_nb, - GGML_MAX_DIMS); + aclTensor * src_trans_tensor = + ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, src_trans_nb, GGML_MAX_DIMS); cann_copy(ctx, acl_src, src_trans_tensor); ggml_cann_release_resources(ctx, acl_src, src_trans_tensor); } @@ -796,10 +783,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1]; } - aclTensor* trans_acl_src = ggml_cann_create_tensor(src_trans_buffer, - ggml_cann_type_mapping(src0->type),ggml_type_size(src0->type), - dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * trans_acl_src = + 
ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type), + dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); if (dst->type == src0->type) { cann_copy(ctx, trans_acl_src, acl_dst); @@ -827,17 +814,20 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { * @param type_size The size of each element in the tensor data type. * @return An ACL tensor initialized with zeros. */ -static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, - size_t n_bytes, int64_t* ne, int64_t dims, - aclDataType type, size_t type_size) { +static aclTensor * aclnn_zero(ggml_backend_cann_context & ctx, + void * buffer, + size_t n_bytes, + int64_t * ne, + int64_t dims, + aclDataType type, + size_t type_size) { size_t nb[GGML_MAX_DIMS]; nb[0] = type_size; for (int i = 1; i < dims; i++) { nb[i] = nb[i - 1] * ne[i - 1]; } - aclTensor* zero = - ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims); + aclTensor * zero = ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero); return zero; GGML_UNUSED(n_bytes); @@ -861,15 +851,18 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, * is 1.0). * @return An ACL tensor initialized with value. */ -static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer, - size_t n_bytes, int64_t* ne, int64_t dims, - aclDataType type, size_t type_size, - float value = 1.0f) { - aclTensor* acl_tensor = - aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); - float alpha_host = 1.0f; - aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); - aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); +static aclTensor * aclnn_values(ggml_backend_cann_context & ctx, + void * buffer, + size_t n_bytes, + int64_t * ne, + int64_t dims, + aclDataType type, + size_t type_size, + float value = 1.0f) { + aclTensor * acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); + float alpha_host = 1.0f; + aclScalar * alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); + aclScalar * other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor, other, alpha); return acl_tensor; } @@ -884,8 +877,7 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer, * @param scalar The scalar value used to fill the tensor. * @param acl_dst The destination tensor to be filled with the scalar value. */ -static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, - aclTensor* acl_dst) { +static void aclnn_fill_scalar(ggml_backend_cann_context & ctx, float scalar, aclTensor * acl_dst) { auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar); ggml_cann_release_resources(ctx, acl_scalar); @@ -913,15 +905,14 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, * initialization via memset or arbitrary values via fill_scalar). * @return An aclTensor pointer created from the cached buffer. 
*/ -static aclTensor* get_cache_acl_tensor( - ggml_backend_cann_context& ctx, - void** buffer, - int64_t &cache_element, - int64_t* ne, - size_t* nb, - ggml_type dtype, - int64_t dims, - float value) { +static aclTensor * get_cache_acl_tensor(ggml_backend_cann_context & ctx, + void ** buffer, + int64_t & cache_element, + int64_t * ne, + size_t * nb, + ggml_type dtype, + int64_t dims, + float value) { // Calculate total number of elements int64_t n_element = 1; for (int i = 0; i < dims; i++) { @@ -940,24 +931,22 @@ static aclTensor* get_cache_acl_tensor( cache_element = n_element; // Initialize cache - int64_t pool_ne[1] = { n_element }; - size_t pool_nb[1] = { ggml_type_size(dtype) }; - aclTensor* acl_value = ggml_cann_create_tensor( - *buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), - pool_ne, pool_nb, 1); + int64_t pool_ne[1] = { n_element }; + size_t pool_nb[1] = { ggml_type_size(dtype) }; + aclTensor * acl_value = + ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), pool_ne, pool_nb, 1); aclnn_fill_scalar(ctx, value, acl_value); ggml_cann_release_resources(ctx, acl_value); } - return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), - ggml_type_size(dtype), ne, nb, dims); + return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), ne, nb, dims); } -void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src = dst->src[0]; +void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_src = ggml_cann_create_tensor(src); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); float eps; memcpy(&eps, dst->op_params, sizeof(float)); @@ -969,61 +958,50 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1]; } - aclTensor* acl_gamma = get_cache_acl_tensor( - ctx, - &ctx.rms_norm_one_tensor_cache.cache, - ctx.rms_norm_one_tensor_cache.size, - src->ne, - acl_gamma_nb, - dst->type, - 1, // dims - 1.0f // value + aclTensor * acl_gamma = get_cache_acl_tensor(ctx, &ctx.rms_norm_one_tensor_cache.cache, + ctx.rms_norm_one_tensor_cache.size, src->ne, acl_gamma_nb, dst->type, + 1, // dims + 1.0f // value ); // build rstd. - int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]}; - size_t acl_rstd_nb[GGML_MAX_DIMS - 1]; + int64_t acl_rstd_ne[] = { src->ne[1], src->ne[2], src->ne[3] }; + size_t acl_rstd_nb[GGML_MAX_DIMS - 1]; // rstd will always be F32. acl_rstd_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1]; } - aclTensor* acl_rstd = get_cache_acl_tensor( - ctx, - &ctx.rms_norm_zero_tensor_cache.cache, - ctx.rms_norm_zero_tensor_cache.size, - acl_rstd_ne, - acl_rstd_nb, - GGML_TYPE_F32, - GGML_MAX_DIMS - 1, - 0.0f // value - ); + aclTensor * acl_rstd = + get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size, + acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS - 1, + 0.0f // value + ); GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd); ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd); } // TODO: performace is low. 
-void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, - float value) { - ggml_tensor* src = dst->src[0]; +void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value) { + ggml_tensor * src = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_src = ggml_cann_create_tensor(src); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); - const int n_past = ((int32_t*)dst->op_params)[0]; + const int n_past = ((int32_t *) dst->op_params)[0]; ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src)); - void* buffer = one_tensor_allocator.get(); + void * buffer = one_tensor_allocator.get(); - aclTensor* mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type), - ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS); + aclTensor * mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type), + ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS); aclnn_fill_scalar(ctx, value, mask_tensor); - aclScalar* alpha = nullptr; - float alphaValue = 1.0f; - alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + aclScalar * alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor, n_past + 1); GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src, n_past + 1, acl_dst); @@ -1046,25 +1024,27 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, * tensor. * @param dims The number of dimensions in the tensor. */ -static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) { - aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims); +static void aclnn_permute(ggml_backend_cann_context & ctx, + aclTensor * acl_src, + aclTensor * acl_dst, + int64_t * new_dim, + uint64_t dims) { + aclIntArray * acl_dims = aclCreateIntArray(new_dim, dims); GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims, acl_dst); ggml_cann_release_resources(ctx, acl_dims); } -static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx, - ggml_tensor* dst, - ggml_tensor* src1, - aclTensor* tmp_cast_tensor, - aclTensor* tmp_im2col_tensor) { +static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context & ctx, + ggml_tensor * dst, + ggml_tensor * src1, + aclTensor * tmp_cast_tensor, + aclTensor * tmp_im2col_tensor) { // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] - int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; - size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; - aclTensor* acl_dst = - ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); + int64_t dst_ne[] = { dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3] }; + size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[3] }; + aclTensor * acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); - int64_t permute_dim[] = {0, 2, 1}; + int64_t permute_dim[] = { 0, 2, 1 }; if (src1->type != dst->type) { aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); } else { @@ -1074,101 +1054,95 @@ static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx, ggml_cann_release_resources(ctx, acl_dst); } -static void ggml_cann_im2col_1d_post_process( - ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1, - aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor, - const 
std::vector<int64_t>& im2col_op_params) {
+static void ggml_cann_im2col_1d_post_process(ggml_backend_cann_context & ctx,
+ ggml_tensor * dst,
+ ggml_tensor * src1,
+ aclTensor * tmp_cast_tensor,
+ aclTensor * tmp_im2col_tensor,
+ const std::vector<int64_t> & im2col_op_params) {
// get params
- const int64_t KH = im2col_op_params[0];
- const int64_t KW = im2col_op_params[1];
- const int64_t IW = im2col_op_params[2];
- const int64_t IC = im2col_op_params[3];
- const int64_t N = im2col_op_params[4];
- const int64_t OH = im2col_op_params[5];
- const int64_t OW = im2col_op_params[6];
- const int64_t s0 = im2col_op_params[7];
- const int64_t p0 = im2col_op_params[8];
- const int64_t d0 = im2col_op_params[9];
+ const int64_t KH = im2col_op_params[0];
+ const int64_t KW = im2col_op_params[1];
+ const int64_t IW = im2col_op_params[2];
+ const int64_t IC = im2col_op_params[3];
+ const int64_t N = im2col_op_params[4];
+ const int64_t OH = im2col_op_params[5];
+ const int64_t OW = im2col_op_params[6];
+ const int64_t s0 = im2col_op_params[7];
+ const int64_t p0 = im2col_op_params[8];
+ const int64_t d0 = im2col_op_params[9];
const int64_t n_bytes_factor = im2col_op_params[10];
// Permute: [N, IC * KH * KW, OW * OH] ->
// [N, OW * OH * n_bytes_factor, IC * KH * KW]
ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
- void* tmp_permute_buffer = tmp_permute_allocator.get();
+ void * tmp_permute_buffer = tmp_permute_allocator.get();
- int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
- size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
+ int64_t tmp_permute_ne[] = { IC * KH * KW, OW * OH * n_bytes_factor, N };
+ size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
tmp_permute_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
}
- aclTensor* tmp_permute_tensor = ggml_cann_create_tensor(
- tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
- ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb,
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+ aclTensor * tmp_permute_tensor =
+ ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
- int64_t permute_dim[] = {0, 2, 1};
+ int64_t permute_dim[] = { 0, 2, 1 };
if (src1->type != dst->type) {
aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
} else {
- aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
- 3);
+ aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim, 3);
}
// number of times the kernel moves in W dimension
const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
- size_t offset;
- void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
+ size_t offset;
+ void * cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
// memory copy with offset to restore 1D im2col from 2d
if (IC > 1) {
- offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
+ offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
size_t size_cpy = KH * KW * ggml_type_size(dst->type);
for (int c = 0; c < IC; c++) {
- cur_permute_buffer = (char*)tmp_permute_buffer + offset +
- KH * KW * c * ggml_type_size(dst->type);
- cur_dst_buffer = (char*)dst->data +
- c * KH * KW * n_step_w * ggml_type_size(dst->type);
+ cur_permute_buffer = (char *) tmp_permute_buffer + offset + KH * KW * c * ggml_type_size(dst->type);
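// Editor's note (illustration only, not part of this patch): this copy loop walks the
// permuted [N, OW * OH * n_bytes_factor, IC * KH * KW] buffer starting at `offset`, i.e.
// skipping the first IC * KH * KW * n_step_w elements, and for every input channel copies
// n_step_w blocks of KH * KW elements; the source pointer advances by KH * KW * IC elements
// per step (one full row over all channels) while the destination advances by KH * KW
// elements, so dst ends up grouped channel by channel, n_step_w blocks each.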
+ cur_dst_buffer = (char *) dst->data + c * KH * KW * n_step_w * ggml_type_size(dst->type); for (int i = 0; i < n_step_w; i++) { - ggml_cann_async_memcpy(ctx, cur_dst_buffer, cur_permute_buffer, size_cpy, - ACL_MEMCPY_DEVICE_TO_DEVICE); - cur_dst_buffer = - (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type); - cur_permute_buffer = (char*)cur_permute_buffer + - KH * KW * IC * ggml_type_size(dst->type); + ggml_cann_async_memcpy(ctx, cur_dst_buffer, cur_permute_buffer, size_cpy, ACL_MEMCPY_DEVICE_TO_DEVICE); + cur_dst_buffer = (char *) cur_dst_buffer + KH * KW * ggml_type_size(dst->type); + cur_permute_buffer = (char *) cur_permute_buffer + KH * KW * IC * ggml_type_size(dst->type); } } } else { - offset = KH * KW * n_step_w * - ggml_type_size(dst->type); // equal to ggml_nbytes(dst) - ggml_cann_async_memcpy(ctx, dst->data, (char*)tmp_permute_buffer + offset, offset, - ACL_MEMCPY_DEVICE_TO_DEVICE); + offset = KH * KW * n_step_w * ggml_type_size(dst->type); // equal to ggml_nbytes(dst) + ggml_cann_async_memcpy(ctx, dst->data, (char *) tmp_permute_buffer + offset, offset, + ACL_MEMCPY_DEVICE_TO_DEVICE); } ggml_cann_release_resources(ctx, tmp_permute_tensor); } -void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; // kernel - ggml_tensor* src1 = dst->src[1]; // input +void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; // kernel + ggml_tensor * src1 = dst->src[1]; // input GGML_TENSOR_BINARY_OP_LOCALS; // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D // im2col and do post-processing to restore it to 1D. - const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1; - - const int64_t N = ne13; + const bool is_2D = ((const int32_t *) (dst->op_params))[6] == 1; + const int32_t s0 = ((const int32_t *) (dst->op_params))[0]; + const int32_t s1 = is_2D ? ((const int32_t *) (dst->op_params))[1] : 1; + const int32_t p0 = ((const int32_t *) (dst->op_params))[2]; + const int32_t p1 = is_2D ? ((const int32_t *) (dst->op_params))[3] : 1; + const int32_t d0 = ((const int32_t *) (dst->op_params))[4]; + const int32_t d1 = is_2D ? ((const int32_t *) (dst->op_params))[5] : 1; + + const int64_t N = ne13; const int64_t IC = ne12; const int64_t KH = ne01; const int64_t KW = ne00; @@ -1181,9 +1155,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const int64_t n_bytes_factor = is_2D ? 1 : 3; // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor] - aclTensor* acl_src1 = ggml_cann_create_tensor(src1); - int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N}; - size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; + aclTensor * acl_src1 = ggml_cann_create_tensor(src1); + int64_t tmp_im2col_ne[] = { OW * OH * n_bytes_factor, IC * KH * KW, N }; + size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; tmp_im2col_nb[0] = ggml_type_size(src1->type); for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { @@ -1193,31 +1167,27 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { // Calculate im2col. 
// If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
// dst.elemcount.
- ggml_cann_pool_alloc im2col_allocator(
- ctx.pool(),
- ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
- void* tmp_im2col_buffer = im2col_allocator.get();
-
- aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
- tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
- ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
-
- std::vector<int64_t> kernel_dims = {KH, KW};
- std::vector<int64_t> dilation_size = {d1, d0};
- std::vector<int64_t> padding_dims = {p1, p0};
- std::vector<int64_t> stride_dims = {s1, s0};
- auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
- auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
- auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
- auto* strides = aclCreateIntArray(stride_dims.data(), 2);
- GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations,
- paddings, strides, tmp_im2col_tensor);
+ ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
+ void * tmp_im2col_buffer = im2col_allocator.get();
+
+ aclTensor * tmp_im2col_tensor =
+ ggml_cann_create_tensor(tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
+ tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+ std::vector<int64_t> kernel_dims = { KH, KW };
+ std::vector<int64_t> dilation_size = { d1, d0 };
+ std::vector<int64_t> padding_dims = { p1, p0 };
+ std::vector<int64_t> stride_dims = { s1, s0 };
+ auto * kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+ auto * dilations = aclCreateIntArray(dilation_size.data(), 2);
+ auto * paddings = aclCreateIntArray(padding_dims.data(), 2);
+ auto * strides = aclCreateIntArray(stride_dims.data(), 2);
+ GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations, paddings, strides, tmp_im2col_tensor);
// Cast if dst is f16.
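// Editor's sketch (illustration only, not part of this patch): OW and OH used here and
// n_step_w in the 1D post-processing all follow the standard convolution output-length
// formula; conv_out_len is a hypothetical helper shown only for reference.
#include <cstdint>

static inline int64_t conv_out_len(int64_t in, int64_t kernel, int64_t stride, int64_t pad, int64_t dil) {
    return (in + 2 * pad - dil * (kernel - 1) - 1) / stride + 1;
}
// e.g. conv_out_len(/*in=*/10, /*kernel=*/3, /*stride=*/1, /*pad=*/0, /*dil=*/1) == 8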
- aclTensor* tmp_cast_tensor = nullptr;
+ aclTensor * tmp_cast_tensor = nullptr;
ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
- void* tmp_cast_buffer = nullptr;
+ void * tmp_cast_buffer = nullptr;
if (src1->type != dst->type) {
tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
tmp_cast_buffer = tmp_cast_allocator.get();
@@ -1227,26 +1197,22 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
}
- tmp_cast_tensor = ggml_cann_create_tensor(
- tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
- ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
- GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+ tmp_cast_tensor =
+ ggml_cann_create_tensor(tmp_cast_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
+ tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, ggml_cann_type_mapping(dst->type));
}
// post-processing
if (is_2D) {
- ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
- tmp_im2col_tensor);
+ ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor, tmp_im2col_tensor);
} else {
- std::vector<int64_t> im2col_op_params = {
- KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
- ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
- tmp_im2col_tensor, im2col_op_params);
+ std::vector<int64_t> im2col_op_params = { KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor };
+ ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor, tmp_im2col_tensor, im2col_op_params);
}
- ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor,
- kernel_size, dilations, paddings, strides);
+ ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor, kernel_size, dilations, paddings,
+ strides);
}
/**
@@ -1262,136 +1228,123 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
* @param ctx The context for the CANN backend operations.
* @param acl_src The tensor on which the exponential function will be applied.
*/ -static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) { +static void aclnn_exp(ggml_backend_cann_context & ctx, aclTensor * acl_src) { GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src); } -void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { - if(acl_dst == nullptr) { +void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { + if (acl_dst == nullptr) { GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCos, acl_src); } else { GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst); } } -void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst) { - if(acl_dst == nullptr) { +void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { + if (acl_dst == nullptr) { GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSin, acl_src); } else { GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst); } } -void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { - const ggml_tensor* src = dst->src[0]; +void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src = dst->src[0]; GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int dim = dst->op_params[0]; + const int dim = dst->op_params[0]; const int max_period = dst->op_params[1]; - int half = dim / 2; + int half = dim / 2; - aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor * acl_src = ggml_cann_create_tensor(src); // arange: [0, ..., half) - float start = 0; - float stop = half; - float step = 1; + float start = 0; + float stop = half; + float step = 1; int64_t n_elements_arange = half; - int64_t tmp_arange_ne[] = {half}; - size_t tmp_arange_nb[] = {sizeof(dst->type)}; + int64_t tmp_arange_ne[] = { half }; + size_t tmp_arange_nb[] = { sizeof(dst->type) }; ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type)); - void* tmp_arange_buffer = arange_allocator.get(); - aclTensor* tmp_arange_tensor = ggml_cann_create_tensor( - tmp_arange_buffer, ggml_cann_type_mapping(dst->type), - ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb, - GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + void * tmp_arange_buffer = arange_allocator.get(); + aclTensor * tmp_arange_tensor = + ggml_cann_create_tensor(tmp_arange_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), + tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange); // freq float freq_param = -logf(max_period) / half; - bool inplace = true; + bool inplace = true; aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace); aclnn_exp(ctx, tmp_arange_tensor); // permute: src [0,1,2,3]->[0,1,3,2] - int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]}; - size_t tmp_permute_nb[GGML_MAX_DIMS]; + int64_t tmp_permute_ne[] = { src->ne[1], src->ne[0], src->ne[2], src->ne[3] }; + size_t tmp_permute_nb[GGML_MAX_DIMS]; tmp_permute_nb[0] = ggml_type_size(src->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; } ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src)); - void* tmp_permute_buffer = permute_allocator.get(); - aclTensor* tmp_permute_tensor = ggml_cann_create_tensor( - tmp_permute_buffer, ggml_cann_type_mapping(src->type), - ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb, - GGML_MAX_DIMS, ACL_FORMAT_ND); - int64_t permute_dim[] = 
{0, 1, 3, 2}; - int64_t num_dims = 4; + void * tmp_permute_buffer = permute_allocator.get(); + aclTensor * tmp_permute_tensor = + ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type), + tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); + int64_t permute_dim[] = { 0, 1, 3, 2 }; + int64_t num_dims = 4; aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims); // timestep * freq - int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2], - src->ne[3]}; - size_t tmp_mul_nb[GGML_MAX_DIMS]; + int64_t tmp_mul_ne[] = { src->ne[1] * half, src->ne[0], src->ne[2], src->ne[3] }; + size_t tmp_mul_nb[GGML_MAX_DIMS]; tmp_mul_nb[0] = ggml_type_size(src->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1]; } - int mul_nelements = - src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; + int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; - ggml_cann_pool_alloc mul_allocator( - ctx.pool(), mul_nelements * ggml_type_size(src->type)); - void* tmp_mul_buffer = mul_allocator.get(); - aclTensor* tmp_mul_tensor = ggml_cann_create_tensor( - tmp_mul_buffer, ggml_cann_type_mapping(src->type), - ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, - ACL_FORMAT_ND); + ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + void * tmp_mul_buffer = mul_allocator.get(); + aclTensor * tmp_mul_tensor = + ggml_cann_create_tensor(tmp_mul_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_mul(ctx, tmp_permute_tensor, tmp_arange_tensor, tmp_mul_tensor); // cos - ggml_cann_pool_alloc cos_allocator( - ctx.pool(), mul_nelements * ggml_type_size(src->type)); - void* tmp_cos_buffer = cos_allocator.get(); - aclTensor* tmp_cos_tensor = ggml_cann_create_tensor( - tmp_cos_buffer, ggml_cann_type_mapping(dst->type), - ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, - ACL_FORMAT_ND); + ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + void * tmp_cos_buffer = cos_allocator.get(); + aclTensor * tmp_cos_tensor = + ggml_cann_create_tensor(tmp_cos_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor); // sin - ggml_cann_pool_alloc sin_allocator( - ctx.pool(), mul_nelements * ggml_type_size(src->type)); - void* tmp_sin_buffer = sin_allocator.get(); - aclTensor* tmp_sin_tensor = ggml_cann_create_tensor( - tmp_sin_buffer, ggml_cann_type_mapping(dst->type), - ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, - ACL_FORMAT_ND); + ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type)); + void * tmp_sin_buffer = sin_allocator.get(); + aclTensor * tmp_sin_tensor = + ggml_cann_create_tensor(tmp_sin_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor); // concat - int64_t concat_dim = 3; - aclTensor* acl_dst = ggml_cann_create_tensor(dst); - aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor}; - aclTensorList* tensor_list = aclCreateTensorList(tensors, 2); + int64_t concat_dim = 3; + aclTensor * acl_dst = ggml_cann_create_tensor(dst); + aclTensor * tensors[] = { tmp_cos_tensor, tmp_sin_tensor }; + 
aclTensorList * tensor_list = aclCreateTensorList(tensors, 2); aclnn_concat(ctx, tensor_list, acl_dst, concat_dim); // release // segmentation fault when delete both tensorList and his elements. - ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor, - tmp_permute_tensor, tmp_mul_tensor, acl_dst); + ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor, tmp_permute_tensor, tmp_mul_tensor, + acl_dst); } /** @@ -1410,8 +1363,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, * @param acl_exp The exponent tensor, each element of which is used to raise * the corresponding element in the destination tensor. */ -static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, - aclTensor* acl_dst, aclTensor* acl_exp) { +static void aclnn_pow_tensor_tensor(ggml_backend_cann_context & ctx, aclTensor * acl_dst, aclTensor * acl_exp) { GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp); } @@ -1436,25 +1388,29 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, * @param step Step size for the exponent increment. * @param dtype Data type for slope tensor. */ -static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer, - float m, int64_t size, float start, float stop, float step, ggml_type dtype){ - aclDataType acl_type = ggml_cann_type_mapping(dtype); - size_t type_size = ggml_type_size(dtype); - - int64_t ne[] = {size}; - size_t nb[] = {type_size}; +static void aclnn_get_slope_inner(ggml_backend_cann_context & ctx, + void * slope_buffer, + float m, + int64_t size, + float start, + float stop, + float step, + ggml_type dtype) { + aclDataType acl_type = ggml_cann_type_mapping(dtype); + size_t type_size = ggml_type_size(dtype); + + int64_t ne[] = { size }; + size_t nb[] = { type_size }; ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size); - void* arange_buffer = arange_allocator.get(); + void * arange_buffer = arange_allocator.get(); - aclTensor* arange_tensor = ggml_cann_create_tensor( - arange_buffer, acl_type, type_size, ne, nb, 1); + aclTensor * arange_tensor = ggml_cann_create_tensor(arange_buffer, acl_type, type_size, ne, nb, 1); aclnn_arange(ctx, arange_tensor, start, stop, step, size); - aclTensor* slope_tensor = ggml_cann_create_tensor( - slope_buffer, acl_type, type_size, ne, nb, 1); + aclTensor * slope_tensor = ggml_cann_create_tensor(slope_buffer, acl_type, type_size, ne, nb, 1); - aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT); + aclScalar * sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor); ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor); @@ -1486,8 +1442,11 @@ static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_bu * @param dtype Data type for slope tensor. 
* */ -static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head, - void* slope_buffer, float max_bias, ggml_type dtype) { +static void aclnn_get_slope(ggml_backend_cann_context & ctx, + int64_t n_head, + void * slope_buffer, + float max_bias, + ggml_type dtype) { const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); float m0 = powf(2.0f, -(max_bias) / n_head_log2); @@ -1511,9 +1470,8 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head, end = 2 * ((n_head - 1) - n_head_log2) + 1; step = 2; count = n_head - n_head_log2; - aclnn_get_slope_inner( - ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), - m1, count, start, end + 1, step, dtype); + aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step, + dtype); } } @@ -1538,17 +1496,19 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head, * - Write data into dst_ptr using only the shape information of the dst tensor. * - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting. */ -static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask, - ggml_tensor* dst, void* dst_ptr, float max_bias) { - void* slope_buffer = nullptr; - void* bias_buffer = nullptr; +static void aclnn_add_alibi(ggml_backend_cann_context & ctx, + ggml_tensor * mask, + ggml_tensor * dst, + void * dst_ptr, + float max_bias) { + void * slope_buffer = nullptr; + void * bias_buffer = nullptr; if (max_bias > 0.0f) { - int64_t n_heads = dst->ne[2]; + int64_t n_heads = dst->ne[2]; ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float)); slope_buffer = slope_allocator.get(); - ggml_cann_pool_alloc bias_allocator( - ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst)); + ggml_cann_pool_alloc bias_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst)); bias_buffer = bias_allocator.get(); aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32); } @@ -1559,16 +1519,12 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask, // broadcast the mask across rows int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 }; - size_t mask_nb[] = { - mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2], - mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3] - }; + size_t mask_nb[] = { mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2], + mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3] }; int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 }; - size_t dst_nb[] = { - dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2], - dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3] - }; + size_t dst_nb[] = { dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2], + dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3] }; // slope is a 1 dim tensor, slope.ne2 == dst.ne2 int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 }; @@ -1578,17 +1534,13 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask, slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1]; } - aclTensor* acl_slope = ggml_cann_create_tensor( - slope_buffer, ACL_FLOAT, sizeof(float), - slope_ne, slope_nb, GGML_MAX_DIMS + 2); - aclTensor* acl_mask = ggml_cann_create_tensor( - mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2); + aclTensor * acl_slope = + 
ggml_cann_create_tensor(slope_buffer, ACL_FLOAT, sizeof(float), slope_ne, slope_nb, GGML_MAX_DIMS + 2); + aclTensor * acl_mask = ggml_cann_create_tensor(mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2); // write data into dst_ptr using only the shape information of the dst tensor. - aclTensor* acl_dst = ggml_cann_create_tensor( - dst_ptr, ggml_cann_type_mapping(dst->type), - ggml_type_size(dst->type), dst_ne, dst_nb, - GGML_MAX_DIMS + 2); + aclTensor * acl_dst = ggml_cann_create_tensor(dst_ptr, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), + dst_ne, dst_nb, GGML_MAX_DIMS + 2); if (max_bias > 0.0f) { int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 }; @@ -1597,9 +1549,8 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask, for (int i = 1; i < GGML_MAX_DIMS + 2; i++) { bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1]; } - aclTensor* bias_tensor = ggml_cann_create_tensor( - bias_buffer, ACL_FLOAT, sizeof(float), - bias_ne, bias_nb, GGML_MAX_DIMS + 2); + aclTensor * bias_tensor = + ggml_cann_create_tensor(bias_buffer, ACL_FLOAT, sizeof(float), bias_ne, bias_nb, GGML_MAX_DIMS + 2); aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor); aclnn_add(ctx, acl_dst, bias_tensor); @@ -1628,17 +1579,16 @@ void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) { * @param acl_dst The destination tensor where the softmax results will be * stored. */ -static void aclnn_softmax(ggml_backend_cann_context & ctx, - aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) { +static void aclnn_softmax(ggml_backend_cann_context & ctx, aclTensor * acl_src, int64_t dim, aclTensor * acl_dst) { GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst); } void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) { - ggml_tensor* src0 = dst->src[0]; - ggml_tensor* src1 = dst->src[1]; // mask + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; // mask - aclTensor* acl_src0 = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_src0 = ggml_cann_create_tensor(src0); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); float scale = 1.0f; float max_bias = 0.0f; @@ -1647,12 +1597,11 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) { memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); // input mul scale - aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); + aclScalar * acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0)); - void* src_tensor_buffer = src_tensor_allocator.get(); - aclTensor* softmax_tensor = ggml_cann_create_tensor( - src_tensor_buffer, ggml_cann_type_mapping(src0->type), - ggml_element_size(src0), src0->ne, src0->nb,GGML_MAX_DIMS); + void * src_tensor_buffer = src_tensor_allocator.get(); + aclTensor * softmax_tensor = ggml_cann_create_tensor(src_tensor_buffer, ggml_cann_type_mapping(src0->type), + ggml_element_size(src0), src0->ne, src0->nb, GGML_MAX_DIMS); aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false); @@ -1684,29 +1633,31 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) { * @param index The index tensor specifying the indices to select from the source tensor. * @param type The data type of the source and destination tensors. 
*/ -static void aclnn_index_select_4d(ggml_backend_cann_context& ctx, - void* src_buffer,int64_t* src_ne, size_t* src_nb, - void* dst_buffer, int64_t* dst_ne, size_t* dst_nb, - ggml_tensor* index, ggml_type type) { +static void aclnn_index_select_4d(ggml_backend_cann_context & ctx, + void * src_buffer, + int64_t * src_ne, + size_t * src_nb, + void * dst_buffer, + int64_t * dst_ne, + size_t * dst_nb, + ggml_tensor * index, + ggml_type type) { for (int64_t i = 0; i < src_ne[3]; i++) { for (int64_t j = 0; j < src_ne[2]; j++) { // src - aclTensor* acl_src_tensor = ggml_cann_create_tensor( - (char*)src_buffer + i * src_nb[3] + j * src_nb[2], - ggml_cann_type_mapping(type), ggml_type_size(type), - src_ne, src_nb, 2); + aclTensor * acl_src_tensor = + ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2], + ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2); // index - aclTensor* acl_index = ggml_cann_create_tensor( - (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1], - ggml_cann_type_mapping(index->type), ggml_element_size(index), - index->ne, index->nb, 1); + aclTensor * acl_index = ggml_cann_create_tensor( + (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1], + ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1); // out - aclTensor* acl_out = ggml_cann_create_tensor( - (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2], - ggml_cann_type_mapping(type), ggml_type_size(type), - dst_ne, dst_nb, 2); + aclTensor * acl_out = + ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2], + ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2); GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor, 0, acl_index, acl_out); ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out); } @@ -1733,162 +1684,154 @@ static void aclnn_index_select_4d(ggml_backend_cann_context& ctx, * @param index The index tensor specifying target positions in the destination tensor. * @param type The data type of the source and destination tensors. 
*/ -static void aclnn_index_copy_4d(ggml_backend_cann_context& ctx, - void* src_buffer,int64_t* src_ne, size_t* src_nb, - void* dst_buffer, int64_t* dst_ne, size_t* dst_nb, - ggml_tensor* index, ggml_type type) { +static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx, + void * src_buffer, + int64_t * src_ne, + size_t * src_nb, + void * dst_buffer, + int64_t * dst_ne, + size_t * dst_nb, + ggml_tensor * index, + ggml_type type) { for (int64_t i = 0; i < src_ne[3]; i++) { for (int64_t j = 0; j < src_ne[2]; j++) { // src - aclTensor* acl_src_tensor = ggml_cann_create_tensor( - (char*)src_buffer + i * src_nb[3] + j * src_nb[2], - ggml_cann_type_mapping(type), ggml_type_size(type), - src_ne, src_nb, 2); + aclTensor * acl_src_tensor = + ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2], + ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2); // index - aclTensor* acl_index = ggml_cann_create_tensor( - (char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1], - ggml_cann_type_mapping(index->type), ggml_element_size(index), - index->ne, index->nb, 1); + aclTensor * acl_index = ggml_cann_create_tensor( + (char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1], + ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1); // out - aclTensor* acl_out = ggml_cann_create_tensor( - (char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2], - ggml_cann_type_mapping(type), ggml_type_size(type), - dst_ne, dst_nb, 2); + aclTensor * acl_out = + ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2], + ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out, 0, acl_index, acl_src_tensor); ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out); } } } -void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; // src - ggml_tensor* src1 = dst->src[1]; // index +void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; // src + ggml_tensor * src1 = dst->src[1]; // index GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); switch (src0->type) { case GGML_TYPE_F16: case GGML_TYPE_F32: - if(src0->type == dst->type) { - aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, - dst->data, dst->ne, dst->nb, - src1, dst->type); + if (src0->type == dst->type) { + aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, + dst->type); } else { - aclTensor* acl_src0 = ggml_cann_create_tensor(src0); - ggml_cann_pool_alloc src_buffer_allocator( - ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst)); - void* src_trans_buffer = src_buffer_allocator.get(); - size_t src_trans_nb[GGML_MAX_DIMS]; + aclTensor * acl_src0 = ggml_cann_create_tensor(src0); + ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst)); + void * src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; src_trans_nb[0] = dst->nb[0]; for (int i = 1; i < GGML_MAX_DIMS; i++) { src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; } - aclTensor* src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), - src0->ne, src_trans_nb, GGML_MAX_DIMS); + aclTensor * src_trans_tensor = + 
ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS); aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type)); - aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, - dst->data, dst->ne, dst->nb, - src1, dst->type); + aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1, + dst->type); ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor); } break; - case GGML_TYPE_Q8_0: { - // add 1 dim for bcast mul. - size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], - dequant_nb[GGML_MAX_DIMS + 1]; - int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], - *dequant_ne; - int64_t scale_offset = 0; - // [3,4,5,64] -> [3,4,5,2,32] - weight_ne[0] = QK8_0; - weight_ne[1] = src0->ne[0] / QK8_0; - weight_nb[0] = sizeof(int8_t); - weight_nb[1] = weight_nb[0] * weight_ne[0]; - for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { - weight_ne[i] = src0->ne[i - 1]; - weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1]; - } - // [3,4,5,64] -> [3,4,5,2,1] - scale_ne[0] = 1; - scale_ne[1] = src0->ne[0] / QK8_0; - scale_nb[0] = sizeof(uint16_t); - scale_nb[1] = scale_nb[0] * scale_ne[0]; - for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { - scale_ne[i] = src0->ne[i - 1]; - scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1]; - } - // [3,4,5,64] -> [3,4,5,2,32] - dequant_ne = weight_ne; - dequant_nb[0] = ggml_type_size(dst->type); - for (int i = 1; i < GGML_MAX_DIMS + 1; i++) { - dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1]; - } - scale_offset = ggml_nelements(src0) * sizeof(int8_t); - ggml_cann_pool_alloc dequant_buffer_allocator( - ctx.pool(), ggml_nelements(src0) * ggml_type_size(dst->type)); - aclTensor* acl_weight_tensor = ggml_cann_create_tensor( - src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, - GGML_MAX_DIMS + 1); - aclTensor* acl_scale_tensor = ggml_cann_create_tensor( - src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb, - GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset); - aclTensor* dequant_tensor = ggml_cann_create_tensor( - dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), - dequant_ne, dequant_nb, GGML_MAX_DIMS + 1); - aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor); - dequant_nb[0] = ggml_type_size(dst->type); - dequant_ne = src0->ne; - for (int i = 1; i < GGML_MAX_DIMS; i++) { - dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1]; - } - aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), - dequant_ne, dequant_nb, - dst->data, dst->ne, dst->nb, - src1, dst->type); + case GGML_TYPE_Q8_0: + { + // add 1 dim for bcast mul. 
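// Editor's sketch (illustration only, not part of this patch): in this buffer layout a Q8_0
// row holds ne0 int8 quants, with one fp16 scale per QK8_0 (= 32) quants stored after all
// weights (see scale_offset below), so viewing the row as [ne0/QK8_0, QK8_0] weights and
// multiplying by a broadcast [ne0/QK8_0, 1] scale view dequantizes it. dequant_q8_0_row_ref
// is a hypothetical reference helper; the scales are passed as float here for simplicity.
#include <cstdint>

static void dequant_q8_0_row_ref(const int8_t * q, const float * scale, float * out, int64_t ne0) {
    const int64_t qk = 32;  // QK8_0
    for (int64_t i = 0; i < ne0; i++) {
        out[i] = (float) q[i] * scale[i / qk];  // one scale per block of 32 quants
    }
}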
+ size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1]; + int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne; + int64_t scale_offset = 0; + // [3,4,5,64] -> [3,4,5,2,32] + weight_ne[0] = QK8_0; + weight_ne[1] = src0->ne[0] / QK8_0; + weight_nb[0] = sizeof(int8_t); + weight_nb[1] = weight_nb[0] * weight_ne[0]; + for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { + weight_ne[i] = src0->ne[i - 1]; + weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1]; + } + // [3,4,5,64] -> [3,4,5,2,1] + scale_ne[0] = 1; + scale_ne[1] = src0->ne[0] / QK8_0; + scale_nb[0] = sizeof(uint16_t); + scale_nb[1] = scale_nb[0] * scale_ne[0]; + for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { + scale_ne[i] = src0->ne[i - 1]; + scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1]; + } + // [3,4,5,64] -> [3,4,5,2,32] + dequant_ne = weight_ne; + dequant_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS + 1; i++) { + dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1]; + } + scale_offset = ggml_nelements(src0) * sizeof(int8_t); + ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(), + ggml_nelements(src0) * ggml_type_size(dst->type)); + aclTensor * acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t), weight_ne, + weight_nb, GGML_MAX_DIMS + 1); + aclTensor * acl_scale_tensor = + ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb, + GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset); + aclTensor * dequant_tensor = + ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1); + aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor); + dequant_nb[0] = ggml_type_size(dst->type); + dequant_ne = src0->ne; + for (int i = 1; i < GGML_MAX_DIMS; i++) { + dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1]; + } + aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne, + dst->nb, src1, dst->type); - ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor); - break; - } + ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor); + break; + } default: GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS"); break; } } -void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; // src - ggml_tensor* src1 = dst->src[1]; // index +void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; // src + ggml_tensor * src1 = dst->src[1]; // index switch (dst->type) { - case GGML_TYPE_F32: { - aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, - dst->data, dst->ne, dst->nb, - src1, dst->type); - break; - } - case GGML_TYPE_F16: { - aclTensor* acl_src0 = ggml_cann_create_tensor(src0); - ggml_cann_pool_alloc src_buffer_allocator( - ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t)); - void* src_trans_buffer = src_buffer_allocator.get(); - size_t src_trans_nb[GGML_MAX_DIMS]; - src_trans_nb[0] = sizeof(uint16_t); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + case GGML_TYPE_F32: + { + aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type); + break; + } + case GGML_TYPE_F16: + { + aclTensor * acl_src0 = ggml_cann_create_tensor(src0); + ggml_cann_pool_alloc 
src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t)); + void * src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor * src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type)); + aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1, + dst->type); + ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor); + break; } - aclTensor* src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), - src0->ne, src_trans_nb, GGML_MAX_DIMS); - aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type)); - aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, - dst->data, dst->ne, dst->nb, - src1, dst->type); - ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor); - break; - } default: GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS"); break; @@ -1910,12 +1853,13 @@ void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { * @param repeats The number of times each element will be repeated. * @param output_size The size of the output tensor. */ -static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, - aclTensor* acl_src, aclTensor* acl_dst, - int64_t dim, int64_t repeats, - int64_t output_size) { - GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim, - output_size, acl_dst); +static void aclnn_repeat_interleave(ggml_backend_cann_context & ctx, + aclTensor * acl_src, + aclTensor * acl_dst, + int64_t dim, + int64_t repeats, + int64_t output_size) { + GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim, output_size, acl_dst); } /** @@ -1930,10 +1874,9 @@ static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, * @param dst The destination tensor where the result of the matrix * multiplication will be stored. */ -static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, - ggml_tensor* dst) { - ggml_tensor* weight = dst->src[0]; // weight - ggml_tensor* input = dst->src[1]; // input +static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * weight = dst->src[0]; // weight + ggml_tensor * input = dst->src[1]; // input // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. 
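// Editor's sketch (illustration only, not part of this patch): the transpose_ne/transpose_nb
// arrays built in the next hunk form a zero-copy transposed view of the weight by swapping
// the first two ne/nb entries; no data is moved. View and transposed_view_2d are hypothetical
// names used only for this illustration.
#include <cstddef>
#include <cstdint>

struct View {
    int64_t ne[2];  // extent of each dimension, in elements
    size_t  nb[2];  // stride of each dimension, in bytes
    void *  data;
};

static View transposed_view_2d(const View & v) {
    return { { v.ne[1], v.ne[0] }, { v.nb[1], v.nb[0] }, v.data };  // swap dims 0 and 1
}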
@@ -1948,27 +1891,21 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, } } - aclTensor* acl_input_tensor = - ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims); - int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], - bcast_weight_ne[2], bcast_weight_ne[3], - bcast_weight_ne[4], bcast_weight_ne[5]}; - size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], - bcast_weight_nb[2], bcast_weight_nb[3], - bcast_weight_nb[4], bcast_weight_nb[5]}; - aclTensor* acl_weight_tensor; + aclTensor * acl_input_tensor = ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims); + int64_t transpose_ne[] = { bcast_weight_ne[1], bcast_weight_ne[0], bcast_weight_ne[2], + bcast_weight_ne[3], bcast_weight_ne[4], bcast_weight_ne[5] }; + size_t transpose_nb[] = { bcast_weight_nb[1], bcast_weight_nb[0], bcast_weight_nb[2], + bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5] }; + aclTensor * acl_weight_tensor; // Only check env once. static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on")); if (weight_to_nz && is_matmul_weight(weight)) { - acl_weight_tensor = - ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ); + acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ); } else { - acl_weight_tensor = - ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND); + acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND); } - aclTensor* acl_dst = - ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); + aclTensor * acl_dst = ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); switch (n_dims) { case 2: @@ -2000,11 +1937,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, * @param dst The destination tensor where the result of the matrix * multiplication will be stored. */ -static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, - ggml_tensor* dst, - const enum ggml_type type) { - ggml_tensor* src0 = dst->src[0]; // weight - ggml_tensor* src1 = dst->src[1]; // input +static void ggml_cann_mul_mat_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst, const enum ggml_type type) { + ggml_tensor * src0 = dst->src[0]; // weight + ggml_tensor * src1 = dst->src[1]; // input // The shape of the weight is NCHW. // Matrix multiplication uses HW dims. @@ -2018,56 +1953,52 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, } else { GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); } - float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size}; + float weight_nb[] = { src0->ne[0] * weight_elem_size, weight_elem_size }; size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; - size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; + size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; // scale stored at the end of weight. Also need transpose. 
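// Editor's note (illustration only, not part of this patch): for these quantized types the
// buffer keeps all raw quants first (weight_size bytes) and appends one fp16 scale per
// QK8_0 (= 32) weights, which is why scale_offset below is src0->data + weight_size and why
// each batch's scales are addressed as scale_offset + batch0 * scale_stride.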
size_t scale_elem_size = sizeof(uint16_t); - size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, - scale_elem_size}; - size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; - char* scale_offset = (char*)src0->data + weight_size; + size_t scale_nb[] = { src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size }; + size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; + char * scale_offset = (char *) src0->data + weight_size; // input - size_t input_elem_size = sizeof(uint16_t); - int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; - size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size}; - size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size; + size_t input_elem_size = sizeof(uint16_t); + int64_t input_ne[] = { src1->ne[0], src1->ne[1] }; + size_t input_nb[] = { input_elem_size, input_ne[0] * input_elem_size }; + size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size; ggml_cann_pool_alloc input_alloctor(ctx.pool()); - void* input_buffer = src1->data; + void * input_buffer = src1->data; // case in if (src1->type != GGML_TYPE_F16) { - aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); - input_buffer = - input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); + aclTensor * acl_src1_tensor = ggml_cann_create_tensor(src1); + input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); - int64_t* input_cast_ne = src1->ne; - size_t input_cast_nb[GGML_MAX_DIMS]; + int64_t * input_cast_ne = src1->ne; + size_t input_cast_nb[GGML_MAX_DIMS]; input_cast_nb[0] = sizeof(uint16_t); for (int i = 1; i < GGML_MAX_DIMS; i++) { input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; } - aclTensor* acl_input_tensor = ggml_cann_create_tensor( - input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, - input_cast_nb, GGML_MAX_DIMS); + aclTensor * acl_input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, input_elem_size, + input_cast_ne, input_cast_nb, GGML_MAX_DIMS); aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); ggml_cann_release_resources(ctx, acl_input_tensor, acl_src1_tensor); } // output - size_t output_elem_size = sizeof(uint16_t); - size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size}; + size_t output_elem_size = sizeof(uint16_t); + size_t output_nb[] = { output_elem_size, dst->ne[0] * output_elem_size }; ggml_cann_pool_alloc output_allocator(ctx.pool()); - void* output_buffer = - output_allocator.alloc(ggml_nelements(dst) * output_elem_size); - size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size; + void * output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size); + size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size; // aclnn - int64_t max_elem_size = 65535; - int64_t split_size = (src0->ne[1] / max_elem_size) + 1; + int64_t max_elem_size = 65535; + int64_t split_size = (src0->ne[1] / max_elem_size) + 1; ggml_cann_pool_alloc workspace_allocator(ctx.pool()); for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { @@ -2077,71 +2008,57 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, int64_t batch1 = (n1 * src1->ne[2]) + c1; int64_t batch0 = (n0 * src0->ne[2]) + c0; - aclTensor* acl_input_tensor = ggml_cann_create_tensor( - (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, - input_elem_size, input_ne, input_nb, 2); + aclTensor * acl_input_tensor = ggml_cann_create_tensor((char *) input_buffer + batch1 * input_stride, + 
ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2); // first split int64_t weight_ne_offset = 0; - int64_t weight_ne[2] = { - max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, - src0->ne[0]}; - int64_t scale_ne_offset = 0; - int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0}; + int64_t weight_ne[2] = { max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0] }; + int64_t scale_ne_offset = 0; + int64_t scale_ne[2] = { weight_ne[0], weight_ne[1] / QK8_0 }; int64_t output_ne_offset = 0; - int64_t output_ne[2] = {weight_ne[0], dst->ne[1]}; - - aclTensor* acl_weight_tensor = ggml_cann_create_tensor( - (char*)src0->data + batch0 * weight_stride, - ggml_cann_type_mapping(type), weight_elem_size, weight_ne, - weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); - aclTensor* acl_scale_tensor = ggml_cann_create_tensor( - scale_offset + batch0 * scale_stride, ACL_FLOAT16, - scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, - scale_ne_offset); - aclTensor* acl_output_tensor = ggml_cann_create_tensor( - (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, - output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, - output_ne_offset); + int64_t output_ne[2] = { weight_ne[0], dst->ne[1] }; + + aclTensor * acl_weight_tensor = + ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type), + weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); + aclTensor * acl_scale_tensor = + ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne, + scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset); + aclTensor * acl_output_tensor = + ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size, + output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset); int64_t antiquantGroupSize = 0; if (src0->ne[0] > QK8_0) { antiquantGroupSize = QK8_0; } - GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor, - acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, antiquantGroupSize, - acl_output_tensor); + GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor, acl_weight_tensor, + acl_scale_tensor, nullptr, nullptr, nullptr, nullptr, antiquantGroupSize, + acl_output_tensor); ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor); // other splits for (int64_t split = 1; split < split_size; split++) { - weight_ne_offset += - weight_elem_size * weight_ne[0] * weight_ne[1]; - weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] - ? src0->ne[1] - (max_elem_size * split) - : max_elem_size; + weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1]; + weight_ne[0] = + max_elem_size * (split + 1) > src0->ne[1] ? 
src0->ne[1] - (max_elem_size * split) : max_elem_size; scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1]; scale_ne[0] = weight_ne[0]; - output_ne_offset += - output_elem_size * output_ne[0] * output_ne[1]; + output_ne_offset += output_elem_size * output_ne[0] * output_ne[1]; output_ne[0] = weight_ne[0]; - acl_weight_tensor = ggml_cann_create_tensor( - (char*)src0->data + batch0 * weight_stride, - ggml_cann_type_mapping(type), weight_elem_size, weight_ne, - weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); - acl_scale_tensor = ggml_cann_create_tensor( - scale_offset + batch0 * scale_stride, ACL_FLOAT16, - scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, - scale_ne_offset); - acl_output_tensor = ggml_cann_create_tensor( - (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, - output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, - output_ne_offset); - GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor, - acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, antiquantGroupSize, - acl_output_tensor); + acl_weight_tensor = + ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type), + weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); + acl_scale_tensor = + ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, + scale_ne, scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset); + acl_output_tensor = + ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset); + GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor, acl_weight_tensor, + acl_scale_tensor, nullptr, nullptr, nullptr, nullptr, antiquantGroupSize, + acl_output_tensor); ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor); } @@ -2151,24 +2068,23 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, // cast out if (dst->type != GGML_TYPE_F16) { - int64_t* output_cast_ne = dst->ne; - size_t output_cast_nb[GGML_MAX_DIMS]; + int64_t * output_cast_ne = dst->ne; + size_t output_cast_nb[GGML_MAX_DIMS]; output_cast_nb[0] = sizeof(uint16_t); for (int i = 1; i < GGML_MAX_DIMS; i++) { output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; } - aclTensor* acl_output_tensor = ggml_cann_create_tensor( - output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne, - output_cast_nb, GGML_MAX_DIMS); - aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); + aclTensor * acl_output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size, + output_cast_ne, output_cast_nb, GGML_MAX_DIMS); + aclTensor * acl_dst_tensor = ggml_cann_create_tensor(dst); aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type)); ggml_cann_release_resources(ctx, acl_output_tensor, acl_dst_tensor); } } -void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { +void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst) { const enum ggml_type type = dst->src[0]->type; switch (type) { case GGML_TYPE_F32: @@ -2201,10 +2117,13 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { * @param dims An array specifying the dimensions along which elements are * shifted. 
*/ -static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst, int64_t* shifts, int64_t* dims) { - aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1); - aclIntArray* acl_dims = aclCreateIntArray(dims, 1); +static void aclnn_roll(ggml_backend_cann_context & ctx, + aclTensor * acl_src, + aclTensor * acl_dst, + int64_t * shifts, + int64_t * dims) { + aclIntArray * acl_shifts = aclCreateIntArray(shifts, 1); + aclIntArray * acl_dims = aclCreateIntArray(dims, 1); GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts, acl_dims, acl_dst); ggml_cann_release_resources(ctx, acl_shifts, acl_dims); } @@ -2222,12 +2141,14 @@ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, * @param index_num The number of positions specified in the index array. * @param value The scalar value used to fill the specified positions. */ -static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, - aclTensor* acl_src, int64_t dim, - int64_t* index, int64_t index_num, - float value) { - aclIntArray* acl_index = aclCreateIntArray(index, index_num); - aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); +static void aclnn_index_fill_tensor(ggml_backend_cann_context & ctx, + aclTensor * acl_src, + int64_t dim, + int64_t * index, + int64_t index_num, + float value) { + aclIntArray * acl_index = aclCreateIntArray(index, index_num); + aclScalar * acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index, acl_value); ggml_cann_release_resources(ctx, acl_index, acl_value); } @@ -2262,85 +2183,82 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, * @param is_neox Whether to use Neox-style repeat strategy * (dim expansion vs repeat_interleave). */ -static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, - float* corr_dims, float ext_factor, - float theta_scale, float freq_scale, - float attn_factor, bool is_neox) { - ggml_tensor* src0 = dst->src[0]; // input - ggml_tensor* src1 = dst->src[1]; // position - ggml_tensor* src2 = dst->src[2]; // freq_factors - - if(src2 == nullptr && ctx.rope_cache.cached - && ctx.rope_cache.ext_factor == ext_factor - && ctx.rope_cache.theta_scale == theta_scale - && ctx.rope_cache.freq_scale == freq_scale - && ctx.rope_cache.attn_factor == attn_factor - && ctx.rope_cache.is_neox == is_neox) { +static void aclnn_cache_init(ggml_backend_cann_context & ctx, + ggml_tensor * dst, + float * corr_dims, + float ext_factor, + float theta_scale, + float freq_scale, + float attn_factor, + bool is_neox) { + ggml_tensor * src0 = dst->src[0]; // input + ggml_tensor * src1 = dst->src[1]; // position + ggml_tensor * src2 = dst->src[2]; // freq_factors + + if (src2 == nullptr && ctx.rope_cache.cached && ctx.rope_cache.ext_factor == ext_factor && + ctx.rope_cache.theta_scale == theta_scale && ctx.rope_cache.freq_scale == freq_scale && + ctx.rope_cache.attn_factor == attn_factor && ctx.rope_cache.is_neox == is_neox) { // use cache. 
return; } int64_t theta_scale_length = src0->ne[0] / 2; - int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1}; - size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float), - theta_scale_length * sizeof(float)}; + int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 }; + size_t theta_scale_nb[] = { sizeof(float), sizeof(float), sizeof(float), theta_scale_length * sizeof(float) }; GGML_ASSERT(src1->type == GGML_TYPE_I32); int64_t position_length = src1->ne[0]; - int64_t position_ne[] = {1, 1, position_length, 1}; - size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), - sizeof(int32_t) * position_length}; + int64_t position_ne[] = { 1, 1, position_length, 1 }; + size_t position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length }; - int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1}; - size_t theta_nb[GGML_MAX_DIMS]; + int64_t theta_ne[] = { theta_scale_length, 1, position_length, 1 }; + size_t theta_nb[GGML_MAX_DIMS]; theta_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; } // theta_scale arange, [0,1,...,ne00/2 - 1] - aclTensor* acl_theta_scale_tensor = nullptr; + aclTensor * acl_theta_scale_tensor = nullptr; // cache theta scale if (ctx.rope_cache.theta_scale_length != theta_scale_length || // theta_scale and freq_scale should not change during the current token inference process, // so we can directly use == here instead of comparing the absolute difference. - ctx.rope_cache.theta_scale != theta_scale || - ctx.rope_cache.freq_scale != freq_scale) { - + ctx.rope_cache.theta_scale != theta_scale || ctx.rope_cache.freq_scale != freq_scale) { ctx.rope_cache.theta_scale_length = theta_scale_length; if (ctx.rope_cache.theta_scale_cache != nullptr) { ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache)); } - ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float), + ACL_MEM_MALLOC_HUGE_FIRST)); - acl_theta_scale_tensor = - ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float), - theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float), + theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); - float start = 0; - float step = 1; - float stop = theta_scale_length; + float start = 0; + float step = 1; + float stop = theta_scale_length; float n_elements = theta_scale_length; aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements); ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool()); - aclTensor* acl_yarn_ramp_tensor = nullptr; + aclTensor * acl_yarn_ramp_tensor = nullptr; if (ext_factor != 0) { // -rope_yarn_ramp // const float y = (i0 / 2 - low) / MAX(0.001f, high - low); // return MIN(1, MAX(0, y)) - 1; yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float)); - void* yarn_ramp_buffer = yarn_ramp_allocator.get(); - acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), - theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); - float zero_value = 0, one_value = 1; - float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]); - aclScalar* low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT); - aclScalar* zero = aclCreateScalar(&zero_value, aclDataType::ACL_FLOAT); - aclScalar* one = 
aclCreateScalar(&one_value, aclDataType::ACL_FLOAT); - aclScalar* denom_safe = aclCreateScalar(&denom_safe_value, aclDataType::ACL_FLOAT); - aclScalar* ext_factor_sc = aclCreateScalar(&ext_factor, aclDataType::ACL_FLOAT); + void * yarn_ramp_buffer = yarn_ramp_allocator.get(); + acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, + theta_scale_nb, GGML_MAX_DIMS); + float zero_value = 0, one_value = 1; + float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]); + aclScalar * low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT); + aclScalar * zero = aclCreateScalar(&zero_value, aclDataType::ACL_FLOAT); + aclScalar * one = aclCreateScalar(&one_value, aclDataType::ACL_FLOAT); + aclScalar * denom_safe = aclCreateScalar(&denom_safe_value, aclDataType::ACL_FLOAT); + aclScalar * ext_factor_sc = aclCreateScalar(&ext_factor, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor, low, one, acl_yarn_ramp_tensor); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor, denom_safe); @@ -2357,9 +2275,9 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, // // we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse // cache freq_scale + (freq_scale - 1) * ramp_mix - float freq_scale_1 = freq_scale - 1; - aclScalar* freq_scale_sc = aclCreateScalar(&freq_scale, aclDataType::ACL_FLOAT); - aclScalar* freq_scale_1_sc = aclCreateScalar(&freq_scale_1, aclDataType::ACL_FLOAT); + float freq_scale_1 = freq_scale - 1; + aclScalar * freq_scale_sc = aclCreateScalar(&freq_scale, aclDataType::ACL_FLOAT); + aclScalar * freq_scale_1_sc = aclCreateScalar(&freq_scale_1, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, freq_scale_1_sc); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor, freq_scale_sc, one); @@ -2367,9 +2285,8 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, } // power - aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, - acl_theta_scale_tensor); + aclScalar * acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); + GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor, acl_theta_scale_tensor); if (ext_factor != 0) { aclnn_mul(ctx, acl_theta_scale_tensor, acl_yarn_ramp_tensor); @@ -2380,22 +2297,20 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_cann_release_resources(ctx, acl_yarn_ramp_tensor, acl_theta_scale); } else { // use cache - acl_theta_scale_tensor = - ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float), - theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float), + theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); } ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool()); // freq_factors if (src2) { freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float)); - void* freq_fac_res_ptr = freq_fac_res_allocator.get(); - aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( - src2->data, ggml_cann_type_mapping(src2->type), - ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); - aclTensor* acl_freq_fac_res_tensor = ggml_cann_create_tensor( - 
freq_fac_res_ptr, ACL_FLOAT, sizeof(float), - theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + void * freq_fac_res_ptr = freq_fac_res_allocator.get(); + aclTensor * acl_freq_factors_tensor = + ggml_cann_create_tensor(src2->data, ggml_cann_type_mapping(src2->type), ggml_type_size(src2->type), + theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); + aclTensor * acl_freq_fac_res_tensor = ggml_cann_create_tensor(freq_fac_res_ptr, ACL_FLOAT, sizeof(float), + theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS); aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, acl_freq_fac_res_tensor); std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor); ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor); @@ -2411,42 +2326,37 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache)); } int64_t repeat_theta_length = theta_scale_length * position_length * 2; - ACL_CHECK(aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); - ACL_CHECK(aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK( + aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); + ACL_CHECK( + aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST)); } // position - aclTensor* acl_position_tensor = ggml_cann_create_tensor( - src1->data, ggml_cann_type_mapping(src1->type), - ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); + aclTensor * acl_position_tensor = + ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), position_ne, + position_nb, GGML_MAX_DIMS); // power * position - int64_t theta_length = theta_scale_length * position_length; - ggml_cann_pool_alloc theta_allocator(ctx.pool(), - theta_length * sizeof(float)); - void* theta_buffer = theta_allocator.get(); + int64_t theta_length = theta_scale_length * position_length; + ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float)); + void * theta_buffer = theta_allocator.get(); - aclTensor* acl_theta_tensor = - ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), - theta_ne, theta_nb, GGML_MAX_DIMS); - aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, - acl_theta_tensor); + aclTensor * acl_theta_tensor = + ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, acl_theta_tensor); // sin/cos - ggml_cann_pool_alloc sin_allocator(ctx.pool(), - theta_length * sizeof(float)); - void* sin_buffer = sin_allocator.get(); - aclTensor* acl_sin_tensor = ggml_cann_create_tensor( - sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, - GGML_MAX_DIMS, ACL_FORMAT_ND); + ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length * sizeof(float)); + void * sin_buffer = sin_allocator.get(); + aclTensor * acl_sin_tensor = + ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor); - ggml_cann_pool_alloc cos_allocator(ctx.pool(), - theta_length * sizeof(float)); - void* cos_buffer = cos_allocator.get(); - aclTensor* acl_cos_tensor = ggml_cann_create_tensor( - cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, - GGML_MAX_DIMS, ACL_FORMAT_ND); + ggml_cann_pool_alloc 
cos_allocator(ctx.pool(), theta_length * sizeof(float)); + void * cos_buffer = cos_allocator.get(); + aclTensor * acl_cos_tensor = + ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND); aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor); if (ext_factor != 0) { @@ -2459,81 +2369,79 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true); } - int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; - size_t sin_reshape_nb[GGML_MAX_DIMS]; + int64_t sin_reshape_ne[4] = { src0->ne[0], 1, src0->ne[2], 1 }; + size_t sin_reshape_nb[GGML_MAX_DIMS]; sin_reshape_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; } - aclTensor* acl_sin_repeat_tensor = - ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float), - sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); - aclTensor* acl_cos_repeat_tensor = - ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float), - sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); + aclTensor * acl_sin_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); + aclTensor * acl_cos_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); // repeat if (is_neox) { - int64_t repeatsArray[] = {1, 1, 1, 2}; + int64_t repeatsArray[] = { 1, 1, 1, 2 }; aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray); aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray); } else { int64_t num_repeats = 2; - int64_t dim = 3; + int64_t dim = 3; int64_t output_size = theta_scale_length * num_repeats; - aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim, - num_repeats, output_size); - aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim, - num_repeats, output_size); + aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim, num_repeats, output_size); + aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim, num_repeats, output_size); } // Other layers use cache except first layer. 
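The is_neox branch above only changes how the half-width sin/cos rows are widened to the full head dimension: a plain repeat appends a second copy of the row, while repeat_interleave duplicates each element in place. A minimal host-side illustration follows; the helper names are invented for the example and are not part of the backend.

    // Widening strategies for the sin/cos rows:
    //   is_neox == true  -> repeat:            [s0, s1, s2] becomes [s0, s1, s2, s0, s1, s2]
    //   is_neox == false -> repeat_interleave: [s0, s1, s2] becomes [s0, s0, s1, s1, s2, s2]
    #include <vector>

    static std::vector<float> repeat_row(const std::vector<float> & v) {
        std::vector<float> out(v);
        out.insert(out.end(), v.begin(), v.end()); // append a second copy of the whole row
        return out;
    }

    static std::vector<float> repeat_interleave_row(const std::vector<float> & v) {
        std::vector<float> out;
        out.reserve(v.size() * 2);
        for (float x : v) { // duplicate each element in place
            out.push_back(x);
            out.push_back(x);
        }
        return out;
    }

Whichever form is built, it is written into the rope cache so that later layers with identical RoPE parameters can reuse it instead of recomputing sin/cos, which is what the flags recorded below enable.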
- ctx.rope_cache.cached = true; - ctx.rope_cache.ext_factor = ext_factor; + ctx.rope_cache.cached = true; + ctx.rope_cache.ext_factor = ext_factor; ctx.rope_cache.theta_scale = theta_scale; - ctx.rope_cache.freq_scale = freq_scale; + ctx.rope_cache.freq_scale = freq_scale; ctx.rope_cache.attn_factor = attn_factor; - ctx.rope_cache.is_neox = is_neox; + ctx.rope_cache.is_neox = is_neox; - ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, - acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor, - acl_cos_repeat_tensor); + ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor, acl_theta_tensor, acl_sin_tensor, + acl_sin_repeat_tensor, acl_cos_tensor, acl_cos_repeat_tensor); } #ifdef __cplusplus extern "C" { #endif -aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize( - const aclTensor* x, const aclTensor* cos, const aclTensor* sin, - int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize, - aclOpExecutor** executor); -aclnnStatus aclnnRotaryPositionEmbedding(void* workspace, - uint64_t workspaceSize, - aclOpExecutor* executor, - aclrtStream stream); +aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(const aclTensor * x, + const aclTensor * cos, + const aclTensor * sin, + int64_t mode, + const aclTensor * yOut, + uint64_t * workspaceSize, + aclOpExecutor ** executor); +aclnnStatus aclnnRotaryPositionEmbedding(void * workspace, + uint64_t workspaceSize, + aclOpExecutor * executor, + aclrtStream stream); #ifdef __cplusplus } #endif -void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; // input +void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; // input // param - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; // const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t*)dst->op_params)[1]; - const int mode = ((int32_t*)dst->op_params)[2]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_ctx_orig = ((int32_t*)dst->op_params)[4]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; GGML_TENSOR_UNARY_OP_LOCALS - memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); // TODO: n_dims <= ne0 GGML_ASSERT(n_dims == ne0); @@ -2542,123 +2450,111 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { const float theta_scale = powf(freq_base, -2.0f / n_dims); float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, - beta_slow, corr_dims); + 
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; // init ctx.rope_cos/rope_sin cache - aclnn_cache_init(ctx, dst, corr_dims, ext_factor, - theta_scale, freq_scale, attn_factor, is_neox); + aclnn_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox); - int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1}; - size_t sin_reshape_nb[GGML_MAX_DIMS]; + int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 }; + size_t sin_reshape_nb[GGML_MAX_DIMS]; sin_reshape_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1]; } - aclTensor* acl_sin_reshape_tensor = - ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float), - sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); - aclTensor* acl_cos_reshape_tensor = - ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float), - sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); + aclTensor * acl_sin_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); + aclTensor * acl_cos_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); - aclTensor* acl_src = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_src = ggml_cann_create_tensor(src0); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); #ifdef ASCEND_310P // Special ROPE operation for 310P // roll input - void* input_roll_buffer; - aclTensor* acl_minus_one_tensor; - void* minus_one_scale_buffer = nullptr; + void * input_roll_buffer; + aclTensor * acl_minus_one_tensor; + void * minus_one_scale_buffer = nullptr; ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); - ggml_cann_pool_alloc minus_one_scale_allocator( - ctx.pool(), sizeof(float) * src0->ne[0]); + ggml_cann_pool_alloc minus_one_scale_allocator(ctx.pool(), sizeof(float) * src0->ne[0]); if (!is_neox) { // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...] 
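For clarity, the pairwise rotation that this branch reassembles from Roll plus a [-1, 1, -1, 1, ...] sign mask can be written directly on the host as below. This is a sketch of the math only, not the 310P kernel path, and rope_pairwise is an invented name.

    // Rotating each adjacent pair (x0, x1) by angle t:
    //   out0 = x0*cos(t) - x1*sin(t)
    //   out1 = x1*cos(t) + x0*sin(t)
    // which equals x*cos + roll(x)*sign*sin with roll = [x1, x0, x3, x2, ...] and sign = [-1, 1, -1, 1, ...].
    #include <cmath>
    #include <vector>

    static std::vector<float> rope_pairwise(const std::vector<float> & x, const std::vector<float> & theta) {
        std::vector<float> out(x.size(), 0.0f);
        for (size_t i = 0; i + 1 < x.size(); i += 2) {
            const float c = std::cos(theta[i / 2]);
            const float s = std::sin(theta[i / 2]);
            out[i]     = x[i] * c     + (-1.0f) * x[i + 1] * s; // rolled value x[i+1], sign -1
            out[i + 1] = x[i + 1] * c + (+1.0f) * x[i]     * s; // rolled value x[i],   sign +1
        }
        return out;
    }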
- input_roll_buffer = roll_allocator.get(); - int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2), - src0->ne[2], src0->ne[3]}; - size_t input_roll_nb[GGML_MAX_DIMS]; + input_roll_buffer = roll_allocator.get(); + int64_t input_roll_ne[4] = { 2, src0->ne[1] * (src0->ne[0] / 2), src0->ne[2], src0->ne[3] }; + size_t input_roll_nb[GGML_MAX_DIMS]; input_roll_nb[0] = ggml_type_size(src0->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; } - aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( - input_roll_buffer, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), input_roll_ne, input_roll_nb, - GGML_MAX_DIMS); - aclTensor* acl_input_tensor = ggml_cann_create_tensor( - src0->data, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), input_roll_ne, input_roll_nb, - GGML_MAX_DIMS); - - int64_t shifts[] = {1}; - int64_t dims[] = {3}; + aclTensor * acl_input_roll_tensor = + ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type), + input_roll_ne, input_roll_nb, GGML_MAX_DIMS); + aclTensor * acl_input_tensor = + ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type), + input_roll_ne, input_roll_nb, GGML_MAX_DIMS); + + int64_t shifts[] = { 1 }; + int64_t dims[] = { 3 }; aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor); // init [-1, 1, -1, 1, ...] minus_one_scale_buffer = minus_one_scale_allocator.get(); - int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; - size_t minus_one_nb[GGML_MAX_DIMS]; + int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 }; + size_t minus_one_nb[GGML_MAX_DIMS]; minus_one_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; } - acl_minus_one_tensor = aclnn_values( - ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], - minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1); - int64_t dim = 3; - int64_t* index = new int64_t[src0->ne[0]]; + acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne, + GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1); + int64_t dim = 3; + int64_t * index = new int64_t[src0->ne[0]]; for (int i = 0; i < src0->ne[0]; i++) { index[i] = i / 2 * 2; } int64_t index_num = src0->ne[0]; - float value = -1; - aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, - index_num, value); + float value = -1; + aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, index_num, value); } else { // roll input: [q0,q1,q2,...] 
-> // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] input_roll_buffer = roll_allocator.get(); - aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( - input_roll_buffer, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); - aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0); + aclTensor * acl_input_roll_tensor = + ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type), + src0->ne, src0->nb, GGML_MAX_DIMS); + aclTensor * acl_input_tensor = ggml_cann_create_tensor(src0); - int64_t shifts[] = {src0->ne[0] / 2}; - int64_t dims[] = {3}; + int64_t shifts[] = { src0->ne[0] / 2 }; + int64_t dims[] = { 3 }; aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor); // init [-1, -1, -1, 1, 1,1,...] - minus_one_scale_buffer = minus_one_scale_allocator.get(); - int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; - size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_scale_buffer = minus_one_scale_allocator.get(); + int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 }; + size_t minus_one_nb[GGML_MAX_DIMS]; minus_one_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; } - acl_minus_one_tensor = aclnn_values( - ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], - minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1); + acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne, + GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1); // -1 * first half - int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; - size_t first_half_nb[GGML_MAX_DIMS]; + int64_t first_half_ne[4] = { src0->ne[0] / 2, 1, 1, 1 }; + size_t first_half_nb[GGML_MAX_DIMS]; first_half_nb[0] = sizeof(float); for (int i = 1; i < GGML_MAX_DIMS; i++) { first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; } - aclTensor* acl_first_half_tensor = ggml_cann_create_tensor( - minus_one_scale_buffer, ACL_FLOAT, sizeof(float), first_half_ne, - first_half_nb, GGML_MAX_DIMS); - bool inplace = true; - float scale = -1; + aclTensor * acl_first_half_tensor = ggml_cann_create_tensor(minus_one_scale_buffer, ACL_FLOAT, sizeof(float), + first_half_ne, first_half_nb, GGML_MAX_DIMS); + bool inplace = true; + float scale = -1; aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace); ggml_cann_release_resources(ctx, acl_first_half_tensor); } @@ -2667,30 +2563,27 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { GGML_ASSERT(n_dims == src0->ne[0]); // input * scale - ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), - ggml_nbytes(src0)); - void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); - size_t input_nb[GGML_MAX_DIMS]; + ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0)); + void * input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); + size_t input_nb[GGML_MAX_DIMS]; input_nb[0] = ggml_type_size(src0->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; } - aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor( - input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type), - ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); - aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor( - input_roll_buffer, ggml_cann_type_mapping(src0->type), - 
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); + aclTensor * acl_input_roll_mul_scale_tensor = + ggml_cann_create_tensor(input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); + aclTensor * acl_input_roll_reshape_tensor = + ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type), + src0->ne, input_nb, GGML_MAX_DIMS); - aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, - acl_input_roll_mul_scale_tensor); + aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, acl_input_roll_mul_scale_tensor); // output - void* output_fp32_buffer; + void * output_fp32_buffer; if (src0->type == GGML_TYPE_F32) { aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor); - aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, - acl_sin_reshape_tensor); + aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor); aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst); // TODO: ne0 != n_dims in mode2 } else if (src0->type == GGML_TYPE_F16) { @@ -2699,36 +2592,27 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { for (int i = 1; i < GGML_MAX_DIMS; i++) { input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; } - ggml_cann_pool_alloc fp32_allocator1( - ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void* input_fp32_buffer1 = fp32_allocator1.get(); - aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor( - input_fp32_buffer1, ACL_FLOAT, sizeof(float), dst->ne, - input_fp32_nb, GGML_MAX_DIMS); - ggml_cann_pool_alloc fp32_allocator2( - ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void* input_fp32_buffer2 = fp32_allocator2.get(); - aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor( - input_fp32_buffer2, ACL_FLOAT, sizeof(float), dst->ne, - input_fp32_nb, GGML_MAX_DIMS); - - ggml_cann_pool_alloc fp32_allocator( - ctx.pool(), ggml_nelements(dst) * sizeof(float)); - output_fp32_buffer = fp32_allocator.get(); - aclTensor* output_fp32_tensor = ggml_cann_create_tensor( - output_fp32_buffer, ACL_FLOAT, sizeof(float), dst->ne, - input_fp32_nb, GGML_MAX_DIMS); + ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), ggml_nelements(dst) * sizeof(float)); + void * input_fp32_buffer1 = fp32_allocator1.get(); + aclTensor * input_fp32_tensor1 = ggml_cann_create_tensor(input_fp32_buffer1, ACL_FLOAT, sizeof(float), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), ggml_nelements(dst) * sizeof(float)); + void * input_fp32_buffer2 = fp32_allocator2.get(); + aclTensor * input_fp32_tensor2 = ggml_cann_create_tensor(input_fp32_buffer2, ACL_FLOAT, sizeof(float), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + + ggml_cann_pool_alloc fp32_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); + output_fp32_buffer = fp32_allocator.get(); + aclTensor * output_fp32_tensor = ggml_cann_create_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1); - aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, - input_fp32_tensor2); - aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, - output_fp32_tensor); + aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor, input_fp32_tensor2); + aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, output_fp32_tensor); aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); - 
ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2, - output_fp32_tensor, acl_sin_reshape_tensor, - acl_minus_one_tensor, acl_input_roll_mul_scale_tensor, - acl_input_roll_reshape_tensor, acl_src); + ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2, output_fp32_tensor, + acl_sin_reshape_tensor, acl_minus_one_tensor, acl_input_roll_mul_scale_tensor, + acl_input_roll_reshape_tensor, acl_src); } return; #endif @@ -2737,155 +2621,146 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { int64_t acl_mode = mode == 0 ? 1 : mode; switch (src0->type) { - case GGML_TYPE_F32: { - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src, - acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, acl_dst); - break; - } - case GGML_TYPE_F16: { - ggml_cann_pool_alloc src_trans_allocator( - ctx.pool(), ggml_nelements(src0) * sizeof(float)); - void* src_trans_buffer = src_trans_allocator.get(); - ggml_cann_pool_alloc dst_trans_allocator( - ctx.pool(), ggml_nelements(dst) * sizeof(float)); - void* dst_trans_buffer = dst_trans_allocator.get(); - - size_t src_trans_nb[GGML_MAX_DIMS]; - src_trans_nb[0] = sizeof(float); - for (int i = 1; i < GGML_MAX_DIMS; i++) { - src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + case GGML_TYPE_F32: + { + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src, acl_cos_reshape_tensor, + acl_sin_reshape_tensor, acl_mode, acl_dst); + break; } + case GGML_TYPE_F16: + { + ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float)); + void * src_trans_buffer = src_trans_allocator.get(); + ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float)); + void * dst_trans_buffer = dst_trans_allocator.get(); - aclTensor* acl_src_trans_tensor = ggml_cann_create_tensor( - src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, - GGML_MAX_DIMS); - aclTensor* acl_dst_trans_tensor = ggml_cann_create_tensor( - dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, - GGML_MAX_DIMS); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(float); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } - aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT); + aclTensor * acl_src_trans_tensor = ggml_cann_create_tensor(src_trans_buffer, ACL_FLOAT, sizeof(float), + src0->ne, src_trans_nb, GGML_MAX_DIMS); + aclTensor * acl_dst_trans_tensor = ggml_cann_create_tensor(dst_trans_buffer, ACL_FLOAT, sizeof(float), + dst->ne, src_trans_nb, GGML_MAX_DIMS); - GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor, - acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, - acl_dst_trans_tensor); + aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT); - aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16); + GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor, acl_cos_reshape_tensor, + acl_sin_reshape_tensor, acl_mode, acl_dst_trans_tensor); - ggml_cann_release_resources(ctx, acl_src_trans_tensor, - acl_dst_trans_tensor); - break; - } + aclnn_cast(ctx, acl_dst_trans_tensor, acl_dst, ACL_FLOAT16); + + ggml_cann_release_resources(ctx, acl_src_trans_tensor, acl_dst_trans_tensor); + break; + } default: GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE"); break; } - ggml_cann_release_resources(ctx, acl_cos_reshape_tensor, - acl_sin_reshape_tensor, acl_src, acl_dst); + ggml_cann_release_resources(ctx, acl_cos_reshape_tensor, 
acl_sin_reshape_tensor, acl_src, acl_dst); } - - void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){ +void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) { ggml_tensor * src0 = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3); + aclTensor * acl_src = ggml_cann_create_tensor(src0); + aclTensor * acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3); GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src, 3, false, acl_dst); ggml_cann_release_resources(ctx, acl_src, acl_dst); } -void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){ +void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) { ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; // stride - int64_t s0 = ((const int32_t*)(dst->op_params))[0]; + int64_t s0 = ((const int32_t *) (dst->op_params))[0]; - aclTensor* acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL); - aclTensor* acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL); - aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL); + aclTensor * acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL); + aclTensor * acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL); + aclTensor * acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL); int64_t strideVal[1]; - strideVal[0] = s0; - aclIntArray *stride = aclCreateIntArray(strideVal, 1); - int64_t paddingVal[] = {0}; - aclIntArray *padding = aclCreateIntArray(paddingVal, 1); - int64_t dilationVal[] = {1}; - aclIntArray *dilation = aclCreateIntArray(dilationVal, 1); - int8_t cubeMathType = 0; + strideVal[0] = s0; + aclIntArray * stride = aclCreateIntArray(strideVal, 1); + int64_t paddingVal[] = { 0 }; + aclIntArray * padding = aclCreateIntArray(paddingVal, 1); + int64_t dilationVal[] = { 1 }; + aclIntArray * dilation = aclCreateIntArray(dilationVal, 1); + int8_t cubeMathType = 0; #ifdef ASCEND_310P cubeMathType = 1; #endif - GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride, - padding, dilation, true, padding, 1, acl_dst, cubeMathType); + GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride, padding, dilation, true, padding, + 1, acl_dst, cubeMathType); ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation); } -void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){ +void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) { ggml_tensor * src0 = dst->src[0]; - aclTensor* acl_input = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_input = ggml_cann_create_tensor(src0); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); - float alphaValue = 1.0f; - aclScalar* alpha = nullptr; - alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + float alphaValue = 1.0f; + aclScalar * alpha = nullptr; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); - GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input, alpha, alpha, alpha, - acl_dst); + GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input, alpha, alpha, alpha, acl_dst); ggml_cann_release_resources(ctx, acl_input, acl_dst, alpha); } -void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){ +void ggml_cann_mean(ggml_backend_cann_context & ctx, 
ggml_tensor * dst) { ggml_tensor * src0 = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_src = ggml_cann_create_tensor(src0); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); - int64_t reduceDimValue[] = {3}; - aclIntArray* reduceDim = aclCreateIntArray(reduceDimValue, 1); - bool keepDim = true; + int64_t reduceDimValue[] = { 3 }; + aclIntArray * reduceDim = aclCreateIntArray(reduceDimValue, 1); + bool keepDim = true; GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src, reduceDim, keepDim, ACL_FLOAT, acl_dst); ggml_cann_release_resources(ctx, acl_src, acl_dst, reduceDim); } -void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){ - ggml_tensor * src0 = dst->src[0]; - int32_t *opts = (int32_t *) dst->op_params; - int64_t paddingsArray[2] = {opts[0], opts[1]}; - aclIntArray* paddings = aclCreateIntArray(paddingsArray, 2); +void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; + int32_t * opts = (int32_t *) dst->op_params; + int64_t paddingsArray[2] = { opts[0], opts[1] }; + aclIntArray * paddings = aclCreateIntArray(paddingsArray, 2); for (int64_t i = 0; i < src0->ne[3]; i++) { - aclTensor* acl_src = ggml_cann_create_tensor( - (char*)src0->data + i * src0->ne[3], - ggml_cann_type_mapping(src0->type), ggml_element_size(src0), - src0->ne, src0->nb, 3); + aclTensor * acl_src = + ggml_cann_create_tensor((char *) src0->data + i * src0->ne[3], ggml_cann_type_mapping(src0->type), + ggml_element_size(src0), src0->ne, src0->nb, 3); - aclTensor* acl_dst = ggml_cann_create_tensor( - (char*)dst->data + i * src0->ne[3], - ggml_cann_type_mapping(dst->type), ggml_element_size(dst), - dst->ne, dst->nb, 3); + aclTensor * acl_dst = + ggml_cann_create_tensor((char *) dst->data + i * src0->ne[3], ggml_cann_type_mapping(dst->type), + ggml_element_size(dst), dst->ne, dst->nb, 3); - GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src, paddings, acl_dst); + GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src, paddings, acl_dst); - ggml_cann_release_resources(ctx, acl_src, acl_dst); + ggml_cann_release_resources(ctx, acl_src, acl_dst); } ggml_cann_release_resources(ctx, paddings); } -void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){ +void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) { ggml_tensor * src0 = dst->src[0]; ggml_tensor * src1 = dst->src[1]; - aclTensor* acl_self = ggml_cann_create_tensor(src0); - aclTensor* acl_other = ggml_cann_create_tensor(src1); + aclTensor * acl_self = ggml_cann_create_tensor(src0); + aclTensor * acl_other = ggml_cann_create_tensor(src1); GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self, acl_other); @@ -2894,15 +2769,15 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){ ggml_cann_release_resources(ctx, acl_self, acl_other); } -void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){ +void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) { ggml_tensor * src0 = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src0); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_src = ggml_cann_create_tensor(src0); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); - float alphaValue = 0.0f; - aclScalar* alpha = nullptr; - alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + float alphaValue = 0.0f; + aclScalar * alpha = nullptr; + alpha = 
aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src, alpha, acl_dst); @@ -2927,7 +2802,7 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){ * @note This function assumes floating-point data types and is designed for * MoE architectures, possibly involving sparse expert routing. */ -static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { +static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) { //dst [M, K, N, 1] ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1] -> [D, M, K, 1] ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1 -> [D, 1, K, 1] @@ -2941,36 +2816,42 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* GGML_ASSERT(batch == ids->ne[1]); ggml_cann_pool_alloc export_allocator(ctx.pool(), src0->ne[0] * src0->ne[1] * ids->ne[0] * ggml_element_size(src0)); - void* export_ptr = export_allocator.get(); + void * export_ptr = export_allocator.get(); for (int64_t i = 0; i < batch; i++) { - aclTensor *select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]); - aclTensor *export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3); + aclTensor * select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]); + aclTensor * export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3); - int64_t select_export_ne[] = {src0->ne[0], src0->ne[1], ids->ne[0]}; - size_t select_export_nb[3]; + int64_t select_export_ne[] = { src0->ne[0], src0->ne[1], ids->ne[0] }; + size_t select_export_nb[3]; select_export_nb[0] = src0->nb[0]; - for (int k = 1;k < 3; k++) { - select_export_nb[k] = select_export_nb[k-1] * select_export_ne[k-1]; + for (int k = 1; k < 3; k++) { + select_export_nb[k] = select_export_nb[k - 1] * select_export_ne[k - 1]; } - aclTensor *select_export = ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), select_export_ne, select_export_nb, 3); + aclTensor * select_export = + ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), + select_export_ne, select_export_nb, 3); GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, export_weight, 0, select_index, select_export); - int64_t select_transpose_ne[] = {select_export_ne[1], select_export_ne[0], select_export_ne[2]}; - size_t select_transpose_nb[] = {select_export_nb[1], select_export_nb[0], select_export_nb[2]}; - aclTensor *select_export_transpose = ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), select_transpose_ne, select_transpose_nb, 3); + int64_t select_transpose_ne[] = { select_export_ne[1], select_export_ne[0], select_export_ne[2] }; + size_t select_transpose_nb[] = { select_export_nb[1], select_export_nb[0], select_export_nb[2] }; + aclTensor * select_export_transpose = + ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0), + select_transpose_ne, select_transpose_nb, 3); - int64_t active_tensor_ne[] = {src1->ne[0], 1, src1->ne[1]}; - size_t active_tensor_nb[] = {src1->nb[0], src1->nb[1], src1->nb[1]}; - aclTensor *active_tensor = ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]); + int64_t active_tensor_ne[] = { src1->ne[0], 1, src1->ne[1] }; + size_t active_tensor_nb[] = { src1->nb[0], src1->nb[1], src1->nb[1] }; + aclTensor * active_tensor 
= + ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]); - int64_t dst_ne[] = {dst->ne[0], 1, dst->ne[1]}; - size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[1]}; - aclTensor *acl_dst = ggml_cann_create_tensor(dst, dst_ne,dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]); + int64_t dst_ne[] = { dst->ne[0], 1, dst->ne[1] }; + size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[1] }; + aclTensor * acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]); GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, active_tensor, select_export_transpose, acl_dst, 2); - ggml_cann_release_resources(ctx, select_index, export_weight, select_export, active_tensor, acl_dst, select_export_transpose); + ggml_cann_release_resources(ctx, select_index, export_weight, select_export, active_tensor, acl_dst, + select_export_transpose); } } @@ -2997,7 +2878,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* * @note This function assumes quantized data types and is designed for * MoE architectures with potential sparse expert routing. */ -static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) { +static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) { // TODO: Use aclnnGroupedMatMul //dst [M, K, N, 1] ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1] @@ -3007,24 +2888,23 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens GGML_TENSOR_BINARY_OP_LOCALS // copy index from npu to cpu - int64_t n_as = ne02; // A - int64_t n_ids = ids->ne[0]; // K + int64_t n_as = ne02; // A + int64_t n_ids = ids->ne[0]; // K std::vector ids_host(ggml_nbytes(ids)); - ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids), - ACL_MEMCPY_DEVICE_TO_HOST); + ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids), ACL_MEMCPY_DEVICE_TO_HOST); ACL_CHECK(aclrtSynchronizeStream(ctx.stream())); char * src0_original = (char *) src0->data; char * src1_original = (char *) src1->data; - char * dst_original = (char *) dst->data; + char * dst_original = (char *) dst->data; ggml_tensor src0_row = *src0; ggml_tensor src1_row = *src1; - ggml_tensor dst_row = *dst; + ggml_tensor dst_row = *dst; const enum ggml_type type = dst->src[0]->type; - float weight_elem_size; + float weight_elem_size; if (type == GGML_TYPE_Q4_0) { weight_elem_size = float(sizeof(uint8_t)) / 2; } else if (type == GGML_TYPE_Q8_0) { @@ -3034,18 +2914,18 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens } // src0_row [D, M, 1, 1] weight without permute - src0_row.ne[2] = 1; - src0_row.ne[3] = 1; - src0_row.nb[0] = weight_elem_size; - src0_row.nb[1] = weight_elem_size * ne00; - src0_row.nb[2] = weight_elem_size * ne00; - src0_row.nb[3] = weight_elem_size * ne00; + src0_row.ne[2] = 1; + src0_row.ne[3] = 1; + src0_row.nb[0] = weight_elem_size; + src0_row.nb[1] = weight_elem_size * ne00; + src0_row.nb[2] = weight_elem_size * ne00; + src0_row.nb[3] = weight_elem_size * ne00; size_t weight_stride = ne00 * ne01 * weight_elem_size; - size_t weight_size = weight_stride * ne02 * ne03; + size_t weight_size = weight_stride * ne02 * ne03; // scale [D, M, 1, 1] -> scale && permute size_t scale_elem_size = sizeof(uint16_t); - size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; + size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; // src1_row [D, 1, 1, 1] -> input src1_row.ne[1] = 1; @@ 
-3063,11 +2943,11 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens //create weight for one row ggml_cann_pool_alloc weight_allocator(ctx.pool()); - void* weight_buffer = weight_allocator.alloc(nb02); + void * weight_buffer = weight_allocator.alloc(nb02); for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { for (int64_t id = 0; id < n_ids; id++) { // expert index - int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); + int32_t i02 = *(int32_t *) (ids_host.data() + iid1 * ids->nb[1] + id * ids->nb[0]); GGML_ASSERT(i02 >= 0 && i02 < n_as); // If B = 1 (broadcast), always use 0; otherwise, use id. @@ -3077,21 +2957,19 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens int64_t i1 = id; int64_t i2 = i12; - void* src0_tmp_ptr = src0_original + i02*weight_stride; - void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride; - void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12; - void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2; + void * src0_tmp_ptr = src0_original + i02 * weight_stride; + void * scale_tmp_ptr = src0_original + weight_size + i02 * scale_stride; + void * src1_tmp_ptr = src1_original + i11 * nb11 + i12 * nb12; + void * dst_tmp_ptr = dst_original + i1 * nb1 + i2 * nb2; // mem cpy - ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride, - ACL_MEMCPY_DEVICE_TO_DEVICE); - void* scale_buffer = (char*)weight_buffer + weight_stride; - ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride, - ACL_MEMCPY_DEVICE_TO_DEVICE); - - src0_row.data = weight_buffer; - src1_row.data = src1_tmp_ptr; - dst_row.data = dst_tmp_ptr; + ggml_cann_async_memcpy(ctx, weight_buffer, src0_tmp_ptr, weight_stride, ACL_MEMCPY_DEVICE_TO_DEVICE); + void * scale_buffer = (char *) weight_buffer + weight_stride; + ggml_cann_async_memcpy(ctx, scale_buffer, scale_tmp_ptr, scale_stride, ACL_MEMCPY_DEVICE_TO_DEVICE); + + src0_row.data = weight_buffer; + src1_row.data = src1_tmp_ptr; + dst_row.data = dst_tmp_ptr; dst_row.src[0] = &src0_row; dst_row.src[1] = &src1_row; @@ -3101,7 +2979,7 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens return; } -void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) { +void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) { const enum ggml_type type = dst->src[0]->type; switch (type) { case GGML_TYPE_F32: @@ -3118,12 +2996,11 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) { } } -void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){ - - ggml_tensor* src0 = dst->src[0]; // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont) - ggml_tensor* src1 = dst->src[1]; // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont) - ggml_tensor* src2 = dst->src[2]; // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont) - ggml_tensor* src3 = dst->src[3]; // mask, fp16 +void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont) + ggml_tensor * src1 = dst->src[1]; // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont) + ggml_tensor * src2 = dst->src[2]; // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont) + ggml_tensor * src3 = dst->src[3]; // mask, fp16 // B, N, S, D (uncont) -> B, S, N, D (cont) int64_t src0_bsnd_ne[GGML_MAX_DIMS]; @@ -3139,107 +3016,96 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, 
ggml_tensor* dst){ size_t src2_bsnd_nb[GGML_MAX_DIMS]; memcpy(src2_bsnd_nb, src2->nb, GGML_MAX_DIMS * sizeof(size_t)); - auto transpose12 = [](int64_t* ne, size_t* nb) { + auto transpose12 = [](int64_t * ne, size_t * nb) { int64_t ne_tmp = ne[1]; size_t nb_tmp = nb[1]; - ne[1] = ne[2]; - nb[1] = nb[2]; - ne[2] = ne_tmp; - nb[2] = nb_tmp; + ne[1] = ne[2]; + nb[1] = nb[2]; + ne[2] = ne_tmp; + nb[2] = nb_tmp; }; transpose12(src0_bsnd_ne, src0_bsnd_nb); transpose12(src1_bsnd_ne, src1_bsnd_nb); transpose12(src2_bsnd_ne, src2_bsnd_nb); - float maxBias = 0.0f; - float scaleValue = 1.0f; + float maxBias = 0.0f; + float scaleValue = 1.0f; float logitSoftcap = 0.0f; - memcpy(&scaleValue, (float*)dst->op_params + 0, sizeof(float)); - memcpy(&maxBias, (float*)dst->op_params + 1, sizeof(float)); - memcpy(&logitSoftcap, (float*)dst->op_params + 2, sizeof(float)); + memcpy(&scaleValue, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&maxBias, (float *) dst->op_params + 1, sizeof(float)); + memcpy(&logitSoftcap, (float *) dst->op_params + 2, sizeof(float)); - if(logitSoftcap == 0.0f){ + if (logitSoftcap == 0.0f) { size_t faElemSize = sizeof(uint16_t); - auto faDataType = ACL_FLOAT16; //ACL_BF16; + auto faDataType = ACL_FLOAT16; //ACL_BF16; - aclTensor* acl_src0_f16_tensor = nullptr; - aclTensor* acl_src1_f16_tensor = nullptr; - aclTensor* acl_src2_f16_tensor = nullptr; + aclTensor * acl_src0_f16_tensor = nullptr; + aclTensor * acl_src1_f16_tensor = nullptr; + aclTensor * acl_src2_f16_tensor = nullptr; // Step 1: cast the src0 (Query) to fp16 if needed ggml_cann_pool_alloc src0_f16_allocator(ctx.pool()); - void* src0_f16_buffer = nullptr; + void * src0_f16_buffer = nullptr; - if(ggml_cann_type_mapping(src0->type) != faDataType){ - aclTensor* acl_src0_f32_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, - src0_bsnd_nb, GGML_MAX_DIMS); - src0_f16_buffer = src0_f16_allocator.alloc( - ggml_nelements(src0) * faElemSize); + if (ggml_cann_type_mapping(src0->type) != faDataType) { + aclTensor * acl_src0_f32_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS); + src0_f16_buffer = src0_f16_allocator.alloc(ggml_nelements(src0) * faElemSize); - int64_t* src0_f16_ne = src0_bsnd_ne; - size_t src0_f16_nb[GGML_MAX_DIMS]; + int64_t * src0_f16_ne = src0_bsnd_ne; + size_t src0_f16_nb[GGML_MAX_DIMS]; src0_f16_nb[0] = sizeof(uint16_t); - for(int i = 1; i < GGML_MAX_DIMS; ++i){ + for (int i = 1; i < GGML_MAX_DIMS; ++i) { src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1]; } - acl_src0_f16_tensor = ggml_cann_create_tensor( - src0_f16_buffer, faDataType, faElemSize, - src0_f16_ne, src0_f16_nb, GGML_MAX_DIMS - ); + acl_src0_f16_tensor = ggml_cann_create_tensor(src0_f16_buffer, faDataType, faElemSize, src0_f16_ne, + src0_f16_nb, GGML_MAX_DIMS); aclnn_cast(ctx, acl_src0_f32_tensor, acl_src0_f16_tensor, faDataType); ggml_cann_release_resources(ctx, acl_src0_f32_tensor); - }else{ - acl_src0_f16_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, - src0_bsnd_nb, GGML_MAX_DIMS); + } else { + acl_src0_f16_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS); } // Step 2: create the acl tensors for src1 (Key), src2 (Value), // and the direct output from FusedInferAttention - acl_src1_f16_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, - src1_bsnd_nb, GGML_MAX_DIMS); - acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, - src2_bsnd_nb, GGML_MAX_DIMS); + acl_src1_f16_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, 
GGML_MAX_DIMS); + acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS); // Step 3: create the PSEShift tensor if needed // this tensor is considered as mask (f16) in the llama.cpp - aclTensor* bcast_pse_tensor = nullptr; + aclTensor * bcast_pse_tensor = nullptr; ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool()); - if(src3 != nullptr){ + if (src3 != nullptr) { // Construct the truncated pse tensor (common for prefill/decode) int64_t trunc_pse_ne[GGML_MAX_DIMS] = { - src3->ne[0], // D - src0->ne[1], // S (number of Q tokens) - src3->ne[2], // mask N - src3->ne[3] // B + src3->ne[0], // D + src0->ne[1], // S (number of Q tokens) + src3->ne[2], // mask N + src3->ne[3] // B }; - size_t* trunc_pse_nb = src3->nb; + size_t * trunc_pse_nb = src3->nb; - aclTensor* acl_mask_f16_trunc_tensor = ggml_cann_create_tensor( - src3->data, ACL_FLOAT16, sizeof(uint16_t), - trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS - ); + aclTensor * acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(src3->data, ACL_FLOAT16, sizeof(uint16_t), + trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS); int64_t bcast_pse_ne[GGML_MAX_DIMS]; - size_t bcast_pse_nb[GGML_MAX_DIMS]; - bcast_pse_ne[0] = src3->ne[0]; // D - bcast_pse_ne[1] = src0->ne[1]; // S - bcast_pse_ne[2] = src0->ne[2]; // N (num_heads) - bcast_pse_ne[3] = src3->ne[3]; // B + size_t bcast_pse_nb[GGML_MAX_DIMS]; + bcast_pse_ne[0] = src3->ne[0]; // D + bcast_pse_ne[1] = src0->ne[1]; // S + bcast_pse_ne[2] = src0->ne[2]; // N (num_heads) + bcast_pse_ne[3] = src3->ne[3]; // B if (maxBias == 0.0f) { // When maxBias == 0.0f, use nb = 0 reduce once repeat (Qwen2) // Construct the bcast tensor (simulate repeat on the head dimension using stride=0) bcast_pse_nb[0] = sizeof(uint16_t); bcast_pse_nb[1] = bcast_pse_nb[0] * bcast_pse_ne[0]; - bcast_pse_nb[2] = 0; // <---- the head dimension shares the same data + bcast_pse_nb[2] = 0; // <---- the head dimension shares the same data bcast_pse_nb[3] = src3->nb[3]; - bcast_pse_tensor = ggml_cann_create_tensor( - src3->data, ACL_FLOAT16, sizeof(uint16_t), - bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS - ); + bcast_pse_tensor = ggml_cann_create_tensor(src3->data, ACL_FLOAT16, sizeof(uint16_t), bcast_pse_ne, + bcast_pse_nb, GGML_MAX_DIMS); ggml_cann_release_resources(ctx, acl_mask_f16_trunc_tensor); } else { @@ -3248,35 +3114,31 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){ bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1]; } - void* bcast_pse_buffer = bcast_pse_allocator.alloc( - ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t) - ); + void * bcast_pse_buffer = + bcast_pse_allocator.alloc(ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t)); - bcast_pse_tensor = ggml_cann_create_tensor( - bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t), - bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS - ); + bcast_pse_tensor = ggml_cann_create_tensor(bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t), + bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS); - int64_t repeats[] = {1, src0->ne[2], 1, 1}; + int64_t repeats[] = { 1, src0->ne[2], 1, 1 }; aclnn_repeat(ctx, acl_mask_f16_trunc_tensor, bcast_pse_tensor, repeats); // alibi // Compute the slope if needed. Derived from ggml_cann_softmax(). 
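Assuming aclnn_get_slope mirrors the ALiBi slope schedule used by ggml's CPU softmax, the per-head slopes it fills can be sketched on the host as below; this is illustrative only, not the device implementation, and alibi_slopes is an invented name.

    // ALiBi slope schedule: heads are split into a power-of-two group and an overflow group.
    #include <cmath>
    #include <cstdint>
    #include <vector>

    static std::vector<float> alibi_slopes(int64_t n_heads, float max_bias) {
        const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) n_heads));
        const float    m0 = std::pow(2.0f, -max_bias / (float) n_head_log2);          // base for the first group
        const float    m1 = std::pow(2.0f, -(max_bias / 2.0f) / (float) n_head_log2); // base for the overflow group
        std::vector<float> slopes(n_heads);
        for (int64_t h = 0; h < n_heads; h++) {
            slopes[h] = h < (int64_t) n_head_log2 ? std::pow(m0, (float) (h + 1))
                                                  : std::pow(m1, (float) (2 * (h - n_head_log2) + 1));
        }
        return slopes;
    }

When maxBias is 0 every slope is 1, which is why the earlier branch skips this computation and simply broadcasts the mask across heads with a zero stride.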
- const int64_t n_heads = src0->ne[2]; + const int64_t n_heads = src0->ne[2]; ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t)); - void* slope_buffer = slope_allocator.get(); + void * slope_buffer = slope_allocator.get(); aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16); - int64_t slope_ne[] = {1, 1, n_heads, 1}; - size_t slope_nb[GGML_MAX_DIMS]; + int64_t slope_ne[] = { 1, 1, n_heads, 1 }; + size_t slope_nb[GGML_MAX_DIMS]; slope_nb[0] = sizeof(uint16_t); - for(int i = 1;ine[2]; // N - int64_t numKeyValueHeads = src1->ne[2]; + int kvTensorNum = 1; + aclTensor * acl_q_tensor = acl_src0_f16_tensor; + aclTensor * acl_k_tensors[] = { acl_src1_f16_tensor }; + aclTensor * acl_v_tensors[] = { acl_src2_f16_tensor }; + aclTensorList * acl_k_tensor_list = aclCreateTensorList(acl_k_tensors, kvTensorNum); + aclTensorList * acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum); + + int64_t numHeads = src0->ne[2]; // N + int64_t numKeyValueHeads = src1->ne[2]; // double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d) - int64_t preTokens = 65535; - int64_t nextTokens = 65535; - char layout[5] = {'B', 'S', 'N', 'D', 0}; - int64_t sparseMode = 0; - int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2; - int64_t blockSize = 0; - int64_t antiquantMode = 0; - bool softmaxLseFlag = false; - int64_t keyAntiquantMode = 0; + int64_t preTokens = 65535; + int64_t nextTokens = 65535; + char layout[5] = { 'B', 'S', 'N', 'D', 0 }; + int64_t sparseMode = 0; + int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2; + int64_t blockSize = 0; + int64_t antiquantMode = 0; + bool softmaxLseFlag = false; + int64_t keyAntiquantMode = 0; int64_t valueAntiquantMode = 0; GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - aclTensor * fa_dst_tensor = nullptr; - aclTensor * acl_dst_tensor = nullptr; + aclTensor * fa_dst_tensor = nullptr; + aclTensor * acl_dst_tensor = nullptr; ggml_cann_pool_alloc out_f16_allocator(ctx.pool()); if (dst->type == GGML_TYPE_F32) { - void* out_f16_buffer = out_f16_allocator.alloc( - ggml_nelements(dst) * faElemSize); + void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize); - int64_t* out_f16_ne = src0_bsnd_ne; - size_t out_f16_nb[GGML_MAX_DIMS]; + int64_t * out_f16_ne = src0_bsnd_ne; + size_t out_f16_nb[GGML_MAX_DIMS]; out_f16_nb[0] = faElemSize; - for(int i = 1; i < GGML_MAX_DIMS; ++i){ + for (int i = 1; i < GGML_MAX_DIMS; ++i) { out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1]; } - fa_dst_tensor = ggml_cann_create_tensor( - out_f16_buffer, faDataType, faElemSize, - out_f16_ne, out_f16_nb, GGML_MAX_DIMS - ); - } - else { + fa_dst_tensor = + ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS); + } else { fa_dst_tensor = ggml_cann_create_tensor(dst); } - GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2, - acl_q_tensor, acl_k_tensor_list, acl_v_tensor_list, // q, k, v - bcast_pse_tensor, nullptr, // pse, mask - nullptr, nullptr, // actSeqLen, actSeqLenkv - nullptr, nullptr, // deqScale1, quantScale1 - nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2 - nullptr, nullptr, // antiquantScale, antiquantOffset - nullptr, // blockTable - nullptr, nullptr, // qPadSize, kvPadSize - nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset - nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset - nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen - numHeads, scaleValue, // heads, scaleValue - preTokens, nextTokens, 
// preTokens, nextTokens - layout, // inputLayout - numKeyValueHeads, // numKVHeads - sparseMode, innerPrecise, // sparseMode, innerPrecise - blockSize, antiquantMode, // blockSize, antiquantMode - softmaxLseFlag, // softmaxLseFlag - keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode - fa_dst_tensor, // attentionOut - nullptr // softmaxLse + GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2, acl_q_tensor, acl_k_tensor_list, + acl_v_tensor_list, // q, k, v + bcast_pse_tensor, nullptr, // pse, mask + nullptr, nullptr, // actSeqLen, actSeqLenkv + nullptr, nullptr, // deqScale1, quantScale1 + nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2 + nullptr, nullptr, // antiquantScale, antiquantOffset + nullptr, // blockTable + nullptr, nullptr, // qPadSize, kvPadSize + nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset + nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset + nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen + numHeads, scaleValue, // heads, scaleValue + preTokens, nextTokens, // preTokens, nextTokens + layout, // inputLayout + numKeyValueHeads, // numKVHeads + sparseMode, innerPrecise, // sparseMode, innerPrecise + blockSize, antiquantMode, // blockSize, antiquantMode + softmaxLseFlag, // softmaxLseFlag + keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode + fa_dst_tensor, // attentionOut + nullptr // softmaxLse ); if (dst->type == GGML_TYPE_F32) { // Step 6: post-processing, permute and cast to f32 - aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); + aclTensor * acl_dst_tensor = ggml_cann_create_tensor(dst); aclnn_cast(ctx, fa_dst_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type)); } - ggml_cann_release_resources(ctx, acl_src0_f16_tensor, - acl_k_tensor_list, - acl_v_tensor_list, - fa_dst_tensor, - acl_dst_tensor, - bcast_pse_tensor); + ggml_cann_release_resources(ctx, acl_src0_f16_tensor, acl_k_tensor_list, acl_v_tensor_list, fa_dst_tensor, + acl_dst_tensor, bcast_pse_tensor); } else { GGML_ABORT("Function is not implemented."); diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h old mode 100755 new mode 100644 index 5c510cc9932e8..ec7455af88cd5 --- a/ggml/src/ggml-cann/aclnn_ops.h +++ b/ggml/src/ggml-cann/aclnn_ops.h @@ -62,7 +62,7 @@ * @param dst The ggml tensor representing the destination, which op is * GGML_OP_REPEAT and specifies the desired dimensions. */ -void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Applies the Leaky ReLU activation function to a tensor using the CANN @@ -82,7 +82,7 @@ void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the result of the Leaky ReLU * activation is stored, which op is `GGML_OP_LEAKY_RELU` */ -void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Concatenates multiple tensors along a specified dimension using the @@ -97,7 +97,7 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @attention tensorList length should be 2 and the dimension using for concat * default to 1. 
*/ -void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Generates a sequence of evenly spaced values within a specified @@ -113,7 +113,7 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst); * `start`, 'stop' and 'step' are in dst->op_params and dst->op is * `GGML_OP_ARANGE`. */ -void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Applies a clamp operation to the elements of a ggml tensor using the @@ -131,7 +131,7 @@ void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the clamped values will be stored. * dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params. */ -void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Scales the elements of a ggml tensor by a constant factor using the @@ -148,7 +148,7 @@ void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the scaled values will be stored. * dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params. */ -void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Sorts the elements of a ggml tensor and returns the indices that @@ -163,7 +163,7 @@ void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the sorted indices will be stored. * dst->op is `GGML_OP_ARGSORT`. */ -void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the Layer Normalization for a ggml tensor using the CANN @@ -185,7 +185,7 @@ void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the normalized values will be stored. * @attention `Var` defaults to dst->ne[0]. */ -void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the Group Normalization for a ggml tensor using the CANN @@ -209,7 +209,7 @@ void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); * * @attention eps defaults to 1e-6f. */ -void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the accumulation of tensors using the CANN backend. @@ -228,7 +228,7 @@ void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the accumulated values will be stored. * `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`. */ -void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the sum of elements along the last dimension of a ggml tensor @@ -244,7 +244,7 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst); * * @attention `reduce_dims` defaults to 3, which means the last dimension. 
*/ -void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the sum of elements in a ggml tensor. @@ -258,7 +258,7 @@ void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); * */ -void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Upsamples a ggml tensor using nearest neighbor interpolation using @@ -274,8 +274,7 @@ void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the upsampled values will be stored. * dst->op is `GGML_OP_UPSCALE`. */ -void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, - ggml_tensor* dst); +void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Pads a ggml tensor to match the dimensions of the destination tensor @@ -290,7 +289,7 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, * @param dst The destination tensor, which specifies the target dimensions for * padding. dst->op is `GGML_OP_PAD`. */ -void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Executes a 2D pooling operation on a ggml tensor using the CANN @@ -307,7 +306,7 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor on which the pooling operation is to be * performed. dst->op is `GGML_OP_POOL_2D`. */ -void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Duplicates a ggml tensor using the CANN backend. @@ -326,7 +325,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); * different shape and dst is no-contiguous. * @note: This func need to simplify. */ -void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the Root Mean Square (RMS) normalization of a ggml tensor @@ -348,7 +347,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the normalized values will be stored. * dst->op is `GGML_OP_RMS_NORM`. */ -void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Applies a diagonal mask to the tensor with a specified value. @@ -363,7 +362,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); * `GGML_OP_DIAG_MASK` * @param value The value to use for masking. */ -void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value); +void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value); /** * @brief Performs an image-to-column transformation on the input tensor. @@ -378,7 +377,7 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float * @param dst The destination tensor that stores the result of the operation. * dst->op is `GGML_OP_IM2COL`. */ -void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes time step embeddings using sine and cosine functions. 
@@ -392,10 +391,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the result of the embedding operation * will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`. */ -void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst); // @see ggml_cann_dup. -void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the softmax activation with optional masking. @@ -417,7 +416,7 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the result will be stored. dst->op is * `GGML_OP_SOFTMAX`. */ -void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Extracts specific rows from a tensor based on indices. @@ -429,7 +428,7 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param ctx The backend CANN context for executing operations. * @param dst The destination tensor where the extracted rows will be stored. */ -void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Writes specific rows into a tensor at positions specified by indices. @@ -441,7 +440,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param ctx The backend CANN context for executing operations. * @param dst The destination tensor where the specified rows will be updated. */ -void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Executes matrix multiplication for the given tensor. @@ -454,7 +453,7 @@ void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor for storing the result of the matrix * multiplication. dst->op is `GGML_OP_MUL_MAT`. */ -void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor. @@ -477,7 +476,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @note The function currently does not support cases where the freq_scale is * not equal 1. */ -void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the index of the maximum value along the specified dimension @@ -492,7 +491,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the indices of the maximum values will * be stored. dst->op is `GGML_OP_ARGMAX`. */ -void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Adds two tensors element-wise and stores the result in a destination @@ -509,8 +508,10 @@ void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param acl_src1 The second source tensor. * @param acl_dst The destination tensor where the result will be stored. 
*/ -void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, - aclTensor* acl_src1, aclTensor* acl_dst = nullptr); +void aclnn_add(ggml_backend_cann_context & ctx, + aclTensor * acl_src0, + aclTensor * acl_src1, + aclTensor * acl_dst = nullptr); /** * @brief Sub two tensors element-wise and stores the result in a destination @@ -527,8 +528,10 @@ void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, * @param acl_src1 The second source tensor. * @param acl_dst The destination tensor where the result will be stored. */ -void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0, - aclTensor* acl_src1, aclTensor* acl_dst = nullptr); +void aclnn_sub(ggml_backend_cann_context & ctx, + aclTensor * acl_src0, + aclTensor * acl_src1, + aclTensor * acl_dst = nullptr); /** * @brief Performs element-wise multiplication of two tensors and stores the @@ -546,8 +549,10 @@ void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0, * @param acl_other The second tensor for element-wise multiplication. * @param acl_dst The destination tensor where the result will be stored. */ -void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_other, aclTensor* acl_dst = nullptr); +void aclnn_mul(ggml_backend_cann_context & ctx, + aclTensor * acl_src, + aclTensor * acl_other, + aclTensor * acl_dst = nullptr); /** * @brief Matrix division, optionally in-place. @@ -567,8 +572,10 @@ void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, * @param inplace Flag indicating whether to perform the operation in-place on * `acl_src`. */ -void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_other, aclTensor* acl_dst = nullptr); +void aclnn_div(ggml_backend_cann_context & ctx, + aclTensor * acl_src, + aclTensor * acl_other, + aclTensor * acl_dst = nullptr); /** * @brief Applies element-wise cosine function to the elements of a tensor. @@ -584,8 +591,7 @@ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src, * @param acl_dst The destination tensor where the cosine results will be * stored. */ -void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst); +void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst); /** * @brief Applies element-wise sine function to the elements of a tensor. @@ -602,8 +608,7 @@ void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, * @param acl_src The source tensor on which the sine function will be applied. * @param acl_dst The destination tensor where the sine results will be stored. */ -void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, - aclTensor* acl_dst); +void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst); /** * @brief Prepares broadcast-compatible ACL tensors for two input tensors and one @@ -621,8 +626,12 @@ void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src, * @param acl_src1 Output pointer to the created ACL tensor corresponding to src1. * @param acl_dst Output pointer to the created ACL tensor corresponding to dst. 
*/ -void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, - aclTensor ** acl_src0, aclTensor ** acl_src1, aclTensor ** acl_dst); +void bcast_shape(ggml_tensor * src0, + ggml_tensor * src1, + ggml_tensor * dst, + aclTensor ** acl_src0, + aclTensor ** acl_src1, + aclTensor ** acl_dst); /** * @brief Computes the 1D transposed convolution (deconvolution) of a ggml @@ -637,7 +646,7 @@ void bcast_shape(ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst, * @param dst The destination tensor where the transposed convolution result * will be stored. dst->op is `GGML_OP_CONV_TRANSPOSE_1D`. */ -void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Applies the ELU (Exponential Linear Unit) activation to a ggml tensor @@ -662,7 +671,7 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds * @param dst The destination tensor where the ELU-activated result will be stored. * dst->op is expected to be `GGML_OP_ELU`. */ -void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Computes the mean of a ggml tensor element-wise using the CANN backend. @@ -677,7 +686,7 @@ void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the mean result will be stored. * dst->op is expected to be `GGML_OP_MEAN`. */ -void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Applies 1D reflect padding to a ggml tensor using the CANN backend. @@ -692,7 +701,7 @@ void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the padded result will be stored. * dst->op is expected to be `GGML_OP_PAD_REFLECT_1D`. */ -void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Counts the number of equal elements in two ggml tensors using the CANN backend. @@ -708,7 +717,7 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the result will be stored. * dst->op is expected to be `GGML_OP_COUNT_EQUAL`. */ -void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Applies the Step activation function to a ggml tensor using the CANN backend. @@ -723,7 +732,7 @@ void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the result will be stored. * dst->op is expected to be `GGML_OP_STEP`. */ -void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Performs the Flash Attention extended operator using the CANN backend. @@ -738,59 +747,46 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst); * @param dst The destination tensor where the result will be stored. * dst->op is expected to be `GGML_OP_FLASH_ATTN_EXT`. 
*/ -void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst); /* * @brief A generic wrapper for ACL resources with custom deleter support. */ -using any_acl_resource = std::unique_ptr>; +using any_acl_resource = std::unique_ptr>; /** * @brief Trait structure used to define how to destroy a given ACL resource type. * * @tparam T ACL resource type. */ -template -struct acl_resource_traits; +template struct acl_resource_traits; /** * @brief Specialization for aclTensor, defines how to destroy an aclTensor resource. */ -template<> -struct acl_resource_traits { - static void destroy(void* p) { - ACL_CHECK(aclDestroyTensor(static_cast(p))); - } +template <> struct acl_resource_traits { + static void destroy(void * p) { ACL_CHECK(aclDestroyTensor(static_cast(p))); } }; /** * @brief Specialization for aclIntArray, defines how to destroy an aclIntArray resource. */ -template<> -struct acl_resource_traits { - static void destroy(void* p) { - ACL_CHECK(aclDestroyIntArray(static_cast(p))); - } +template <> struct acl_resource_traits { + static void destroy(void * p) { ACL_CHECK(aclDestroyIntArray(static_cast(p))); } }; /** * @brief Specialization for aclScalar, defines how to destroy an aclScalar resource. */ -template<> -struct acl_resource_traits { - static void destroy(void* p) { - ACL_CHECK(aclDestroyScalar(static_cast(p))); - } +template <> struct acl_resource_traits { + static void destroy(void * p) { ACL_CHECK(aclDestroyScalar(static_cast(p))); } }; /** * @brief Specialization for aclTensorList, defines how to destroy an aclTensorList resource. */ -template<> -struct acl_resource_traits { - static void destroy(void* p) { - ACL_CHECK(aclDestroyTensorList(static_cast(p))); - } +template <> struct acl_resource_traits { + static void destroy(void * p) { ACL_CHECK(aclDestroyTensorList(static_cast(p))); } }; /** @@ -800,14 +796,8 @@ struct acl_resource_traits { * @param ptr Raw pointer to ACL resource. * @return any_acl_resource Smart pointer that handles destruction. */ -template -any_acl_resource make_acl_resource(T* ptr) { - return any_acl_resource( - static_cast(ptr), - [](void* p) { - acl_resource_traits::destroy(p); - } - ); +template any_acl_resource make_acl_resource(T * ptr) { + return any_acl_resource(static_cast(ptr), [](void * p) { acl_resource_traits::destroy(p); }); } /** @@ -817,8 +807,7 @@ any_acl_resource make_acl_resource(T* ptr) { * @param vec Target vector to hold ACL resources. * @param args Raw pointers to ACL resources. */ -template -void register_acl_resources(std::vector& vec, Args*... args) { +template void register_acl_resources(std::vector & vec, Args *... args) { (vec.emplace_back(make_acl_resource(args)), ...); } @@ -826,39 +815,36 @@ void register_acl_resources(std::vector& vec, Args*... args) { * @brief Task class that wraps the execution of an aclnn function call. 
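// Illustrative sketch (not part of the patch): the type-erased RAII pattern behind
// any_acl_resource / make_acl_resource / register_acl_resources, shown standalone with a
// plain struct standing in for the ACL handle types. dummy_handle, resource_traits,
// make_resource and register_resources are hypothetical names used only for this sketch.
#include <cstdio>
#include <functional>
#include <memory>
#include <vector>

using any_resource = std::unique_ptr<void, std::function<void(void *)>>;

struct dummy_handle { int id; };

template <typename T> struct resource_traits;

template <> struct resource_traits<dummy_handle> {
    static void destroy(void * p) {
        std::printf("destroying handle %d\n", static_cast<dummy_handle *>(p)->id);
        delete static_cast<dummy_handle *>(p);
    }
};

// Wrap a raw handle in a unique_ptr whose deleter dispatches to the per-type traits.
template <typename T> any_resource make_resource(T * ptr) {
    return any_resource(static_cast<void *>(ptr), [](void * p) { resource_traits<T>::destroy(p); });
}

// Fold expression registers any number of handles of mixed types into one vector.
template <typename... Args> void register_resources(std::vector<any_resource> & vec, Args *... args) {
    (vec.emplace_back(make_resource(args)), ...);
}

int main() {
    std::vector<any_resource> resources;
    register_resources(resources, new dummy_handle{ 1 }, new dummy_handle{ 2 });
    // every registered handle is released automatically when the vector goes out of scope
}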
*/ class aclnn_task : public cann_task { - public: - aclnn_task(aclnn_func_t aclnn_func, void * workspace_addr, - uint64_t workspace_size, aclOpExecutor * executor, - aclrtStream stream) : - aclnn_func_(aclnn_func), - workspace_addr_(workspace_addr), - workspace_size_(workspace_size), - executor_(executor), - stream_(stream) {} - virtual void run_task() override { - ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_)); - } - private: - aclnn_func_t aclnn_func_; - void * workspace_addr_; - uint64_t workspace_size_; - aclOpExecutor * executor_; - aclrtStream stream_; + public: + aclnn_task(aclnn_func_t aclnn_func, + void * workspace_addr, + uint64_t workspace_size, + aclOpExecutor * executor, + aclrtStream stream) : + aclnn_func_(aclnn_func), + workspace_addr_(workspace_addr), + workspace_size_(workspace_size), + executor_(executor), + stream_(stream) {} + + virtual void run_task() override { ACL_CHECK(aclnn_func_(workspace_addr_, workspace_size_, executor_, stream_)); } + private: + aclnn_func_t aclnn_func_; + void * workspace_addr_; + uint64_t workspace_size_; + aclOpExecutor * executor_; + aclrtStream stream_; }; /** * @brief Task class that releases ACL resources after usage. */ class release_resource_task : public cann_task { -public: - release_resource_task(std::vector&& resources){ - resource_ = std::move(resources); - } + public: + release_resource_task(std::vector && resources) { resource_ = std::move(resources); } - virtual void run_task() override { - resource_.clear(); - } -private: + virtual void run_task() override { resource_.clear(); } + private: std::vector resource_; }; @@ -866,38 +852,40 @@ class release_resource_task : public cann_task { * @brief Task class for performing asynchronous memory copy operations. */ class async_memcpy_task : public cann_task { -public: - async_memcpy_task(void* dst, const void* src, size_t size, - aclrtMemcpyKind kind, aclrtStream stream) - : dst_(dst), src_(src), size_(size), kind_(kind), stream_(stream) {} - - virtual void run_task() override { - ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_)); - } -private: - void* dst_; - const void* src_; - size_t size_; + public: + async_memcpy_task(void * dst, const void * src, size_t size, aclrtMemcpyKind kind, aclrtStream stream) : + dst_(dst), + src_(src), + size_(size), + kind_(kind), + stream_(stream) {} + + virtual void run_task() override { ACL_CHECK(aclrtMemcpyAsync(dst_, size_, src_, size_, kind_, stream_)); } + private: + void * dst_; + const void * src_; + size_t size_; aclrtMemcpyKind kind_; - aclrtStream stream_; + aclrtStream stream_; }; /** * @brief Task class for performing asynchronous memory set operations. 
*/ class async_memset_task : public cann_task { - public: - async_memset_task(void* buffer, size_t size, int32_t value, aclrtStream stream) - : buffer_(buffer), size_(size), value_(value), stream_(stream) {} - - virtual void run_task() override { - ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_)); - } - private: - void* buffer_; - size_t size_; - int32_t value_; - aclrtStream stream_; + public: + async_memset_task(void * buffer, size_t size, int32_t value, aclrtStream stream) : + buffer_(buffer), + size_(size), + value_(value), + stream_(stream) {} + + virtual void run_task() override { ACL_CHECK(aclrtMemsetAsync(buffer_, size_, value_, size_, stream_)); } + private: + void * buffer_; + size_t size_; + int32_t value_; + aclrtStream stream_; }; /** @@ -918,25 +906,24 @@ class async_memset_task : public cann_task { * same stream are executed in queue order. */ -#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \ - do { \ - uint64_t workspaceSize = 0; \ - aclOpExecutor * executor; \ - void * workspaceAddr = nullptr; \ - ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor));\ - /* workspace should alloced in main thread to keep malloc order when using vmm. */ \ - if (workspaceSize > 0) { \ - ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \ - workspaceAddr = workspace_allocator.get(); \ - } \ - if (CTX.async_mode) { \ - auto task = \ - std::make_unique(aclnn##OP_NAME, workspaceAddr, workspaceSize, \ - executor, CTX.stream()); \ - CTX.task_queue.submit_task(std::move(task)); \ - } else { \ - ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream()));\ - } \ +#define GGML_CANN_CALL_ACLNN_OP(CTX, OP_NAME, ...) \ + do { \ + uint64_t workspaceSize = 0; \ + aclOpExecutor * executor; \ + void * workspaceAddr = nullptr; \ + ACL_CHECK(aclnn##OP_NAME##GetWorkspaceSize(__VA_ARGS__, &workspaceSize, &executor)); \ + /* workspace should alloced in main thread to keep malloc order when using vmm. */ \ + if (workspaceSize > 0) { \ + ggml_cann_pool_alloc workspace_allocator(CTX.pool(), workspaceSize); \ + workspaceAddr = workspace_allocator.get(); \ + } \ + if (CTX.async_mode) { \ + auto task = \ + std::make_unique(aclnn##OP_NAME, workspaceAddr, workspaceSize, executor, CTX.stream()); \ + CTX.task_queue.submit_task(std::move(task)); \ + } else { \ + ACL_CHECK(aclnn##OP_NAME(workspaceAddr, workspaceSize, executor, CTX.stream())); \ + } \ } while (0) /** @@ -947,11 +934,10 @@ class async_memset_task : public cann_task { * @param ctx Backend context which manages task submission and async mode. * @param args Pointers to ACL resources to be released. */ -template -void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) { +template void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... args) { std::vector resources; register_acl_resources(resources, std::forward(args)...); - if(ctx.async_mode) { + if (ctx.async_mode) { auto task = std::make_unique(std::move(resources)); ctx.task_queue.submit_task(std::move(task)); } @@ -966,8 +952,11 @@ void ggml_cann_release_resources(ggml_backend_cann_context & ctx, Args &&... arg * @param len Size of memory to copy (in bytes). * @param kind Type of memory copy (host-to-device, device-to-host, etc). 
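// Illustrative sketch (not part of the patch) of the sync-vs-async dispatch that
// GGML_CANN_CALL_ACLNN_OP performs: the workspace is sized and allocated on the calling
// thread, then the launch either runs inline or is wrapped in a task for a worker queue.
// fake_op_workspace_size and fake_op are hypothetical stand-ins for the generated
// aclnn##OP_NAME symbols; a std::queue drained inline stands in for the task queue.
#include <cstdint>
#include <cstdio>
#include <functional>
#include <queue>
#include <vector>

static int fake_op_workspace_size(uint64_t * size) { *size = 256; return 0; }

static int fake_op(void * ws, uint64_t ws_size) {
    std::printf("running op with %llu-byte workspace at %p\n", (unsigned long long) ws_size, ws);
    return 0;
}

int main() {
    bool async_mode = true;
    std::queue<std::function<void()>> task_queue;

    uint64_t workspace_size = 0;
    fake_op_workspace_size(&workspace_size);

    // Workspace is allocated on the submitting thread so allocations stay ordered.
    std::vector<uint8_t> workspace(workspace_size);
    void * workspace_addr = workspace_size > 0 ? workspace.data() : nullptr;

    if (async_mode) {
        task_queue.push([=] { fake_op(workspace_addr, workspace_size); });
    } else {
        fake_op(workspace_addr, workspace_size);
    }

    // A worker thread would normally drain the queue; it is drained inline here.
    while (!task_queue.empty()) {
        task_queue.front()();
        task_queue.pop();
    }
}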
*/ -inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst, - const void * src, size_t len, aclrtMemcpyKind kind) { +inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, + void * dst, + const void * src, + size_t len, + aclrtMemcpyKind kind) { if (ctx.async_mode) { auto task = std::make_unique(dst, const_cast(src), len, kind, ctx.stream()); ctx.task_queue.submit_task(std::move(task)); @@ -976,8 +965,11 @@ inline void ggml_cann_async_memcpy(ggml_backend_cann_context & ctx, void * dst, } } -inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst, - const void * src, size_t len, aclrtMemcpyKind kind) { +inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, + void * dst, + const void * src, + size_t len, + aclrtMemcpyKind kind) { if (ctx->async_mode) { auto task = std::make_unique(dst, const_cast(src), len, kind, ctx->stream()); ctx->task_queue.submit_task(std::move(task)); @@ -994,8 +986,7 @@ inline void ggml_cann_async_memcpy(ggml_backend_cann_context * ctx, void * dst, * @param size Size of the memory buffer (in bytes). * @param value Value to set in the buffer. */ -inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer, - size_t size, int value) { +inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffer, size_t size, int value) { if (ctx.async_mode) { auto task = std::make_unique(buffer, size, value, ctx.stream()); ctx.task_queue.submit_task(std::move(task)); @@ -1029,7 +1020,7 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe * @param dst The destination tensor where the expert-weighted token outputs are stored. * Expected to be of shape [M, K, N, 1]. */ -void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst); /** * @brief Check whether a tensor is a weight tensor for matrix multiplication. @@ -1041,20 +1032,14 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst); * * @param tensor Pointer to the target ggml_tensor object (const-qualified). */ -static bool is_matmul_weight(const ggml_tensor* tensor) { - std::string name = ggml_get_name(tensor); - static const std::unordered_set weight_suffixes{ - "output.weight", - "attn_q.weight", - "attn_k.weight", - "attn_v.weight", - "attn_output.weight", - "ffn_gate.weight", - "ffn_up.weight", - "ffn_down.weight" - }; - - for (const auto& suffix : weight_suffixes) { +static bool is_matmul_weight(const ggml_tensor * tensor) { + std::string name = ggml_get_name(tensor); + static const std::unordered_set weight_suffixes{ "output.weight", "attn_q.weight", + "attn_k.weight", "attn_v.weight", + "attn_output.weight", "ffn_gate.weight", + "ffn_up.weight", "ffn_down.weight" }; + + for (const auto & suffix : weight_suffixes) { if (name.find(suffix) != std::string::npos) { return true; } @@ -1078,14 +1063,13 @@ static bool is_matmul_weight(const ggml_tensor* tensor) { * @param ctx The CANN backend context used to manage execution and resources. * @param dst The destination tensor. 
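// Illustrative sketch (not part of the patch): the substring check used by is_matmul_weight
// above, shown standalone. It returns true when a tensor name contains one of the known
// weight suffixes; the suffix list is the one from the patch, the test harness is hypothetical.
#include <cassert>
#include <string>
#include <unordered_set>

static bool name_is_matmul_weight(const std::string & name) {
    static const std::unordered_set<std::string> weight_suffixes{
        "output.weight",      "attn_q.weight",   "attn_k.weight",  "attn_v.weight",
        "attn_output.weight", "ffn_gate.weight", "ffn_up.weight",  "ffn_down.weight"
    };
    for (const auto & suffix : weight_suffixes) {
        if (name.find(suffix) != std::string::npos) {
            return true;
        }
    }
    return false;
}

int main() {
    assert(name_is_matmul_weight("blk.0.attn_q.weight"));
    assert(!name_is_matmul_weight("blk.0.attn_norm.weight"));
}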
*/ -template -void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; - ggml_tensor* src1 = dst->src[1]; +template void ggml_cann_binary_op(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src0 = dst->src[0]; + ggml_tensor * src1 = dst->src[1]; - aclTensor* acl_src0; - aclTensor* acl_src1; - aclTensor* acl_dst; + aclTensor * acl_src0; + aclTensor * acl_src1; + aclTensor * acl_dst; // Need bcast bcast_shape(src0, src1, dst, &acl_src0, &acl_src1, &acl_dst); @@ -1094,7 +1078,6 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_cann_release_resources(ctx, acl_src0, acl_src1, acl_dst); } - /** * @brief Applies a unary operation to an input tensor using the CANN backend. * @@ -1107,12 +1090,12 @@ void ggml_cann_binary_op(ggml_backend_cann_context& ctx, ggml_tensor* dst) { * @param ctx The CANN backend context for managing resources and execution. * @param dst The destination tensor. Its src[0] is treated as the input tensor. */ -template - void ggml_cann_op_unary(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src = dst->src[0]; +template +void ggml_cann_op_unary(ggml_backend_cann_context & ctx, ggml_tensor * dst) { + ggml_tensor * src = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src); - aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor * acl_src = ggml_cann_create_tensor(src); + aclTensor * acl_dst = ggml_cann_create_tensor(dst); unary_op(ctx, acl_src, acl_dst); ggml_cann_release_resources(ctx, acl_src, acl_dst); @@ -1138,9 +1121,9 @@ template * * @see GGML_CANN_CALL_OP_UNARY */ -void ggml_cann_op_unary( - std::function unary_op, - ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_op_unary(std::function unary_op, + ggml_backend_cann_context & ctx, + ggml_tensor * dst); /** * @brief Applies a gated (GLU-style) unary operation using the CANN backend. @@ -1172,9 +1155,9 @@ void ggml_cann_op_unary( * * @see GGML_CANN_CALL_OP_UNARY_GATED */ -void ggml_cann_op_unary_gated( - std::function unary_op, - ggml_backend_cann_context& ctx, ggml_tensor* dst); +void ggml_cann_op_unary_gated(std::function unary_op, + ggml_backend_cann_context & ctx, + ggml_tensor * dst); /** * @brief Helper macro to call a unary ACL operator via ggml_cann_op_unary. @@ -1197,16 +1180,13 @@ void ggml_cann_op_unary_gated( * @see ggml_cann_op_unary * @see GGML_CANN_CALL_ACLNN_OP */ -#define GGML_CANN_CALL_OP_UNARY(OP_NAME) \ - do { \ - auto lambda = [](ggml_backend_cann_context& ctx, \ - aclTensor* acl_src, \ - aclTensor* acl_dst) { \ - GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \ - }; \ - ggml_cann_op_unary(lambda, ctx, dst); \ - } \ - while (0) +#define GGML_CANN_CALL_OP_UNARY(OP_NAME) \ + do { \ + auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \ + GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \ + }; \ + ggml_cann_op_unary(lambda, ctx, dst); \ + } while (0) /** * @brief Helper macro to call a gated unary ACL operator via ggml_cann_op_unary_gated. 
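// Illustrative sketch (not part of the patch) of the pattern behind GGML_CANN_CALL_OP_UNARY:
// the macro wraps a per-operator call in a lambda so one generic dispatcher can drive any
// unary op, and it relies on the caller's local names being in scope (ctx/dst in the patch,
// in/out here). apply_unary, CALL_UNARY and NEGATE_OP are hypothetical names for this sketch.
#include <cstdio>
#include <functional>

static void apply_unary(std::function<void(float &, float)> unary_op, float & dst, float src) {
    unary_op(dst, src);
}

#define NEGATE_OP(dst, src) ((dst) = -(src))

#define CALL_UNARY(OP_NAME)                                               \
    do {                                                                  \
        auto lambda = [](float & dst, float src) { OP_NAME(dst, src); };  \
        apply_unary(lambda, out, in);                                     \
    } while (0)

int main() {
    float in = 3.0f, out = 0.0f;
    CALL_UNARY(NEGATE_OP);
    std::printf("%f\n", out);  // prints -3.000000
}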
@@ -1229,15 +1209,12 @@ void ggml_cann_op_unary_gated( * @see ggml_cann_op_unary_gated * @see GGML_CANN_CALL_ACLNN_OP */ -#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \ - do { \ - auto lambda = [](ggml_backend_cann_context& ctx, \ - aclTensor* acl_src, \ - aclTensor* acl_dst) { \ - GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \ - }; \ - ggml_cann_op_unary_gated(lambda, ctx, dst); \ - } \ - while (0) +#define GGML_CANN_CALL_OP_UNARY_GATED(OP_NAME) \ + do { \ + auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { \ + GGML_CANN_CALL_ACLNN_OP(ctx, OP_NAME, acl_src, acl_dst); \ + }; \ + ggml_cann_op_unary_gated(lambda, ctx, dst); \ + } while (0) #endif // CANN_ACLNN_OPS diff --git a/ggml/src/ggml-cann/common.h b/ggml/src/ggml-cann/common.h old mode 100755 new mode 100644 index debbcadc1e4c5..e87dbcf329ff2 --- a/ggml/src/ggml-cann/common.h +++ b/ggml/src/ggml-cann/common.h @@ -44,7 +44,7 @@ #include "../include/ggml.h" #include "../ggml-impl.h" -#define MATRIX_ROW_PADDING 512 +#define MATRIX_ROW_PADDING 512 #define GGML_CANN_MAX_STREAMS 8 /** @@ -56,8 +56,7 @@ * @param line The line number at which the error occurred. * @param msg The error message. */ -[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, - const char* file, int line, const char* msg); +[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg); /** * @brief Checks the result of a CANN function call and invokes the error @@ -89,25 +88,24 @@ struct ggml_cann_device_info { * @brief Information about a single CANN device. */ struct cann_device_info { - int cc; /**< Compute capability. */ + int cc; /**< Compute capability. */ size_t smpb; /**< Maximum shared memory per block. */ - bool vmm; /**< Virtual memory support. */ + bool vmm; /**< Virtual memory support. */ size_t vmm_granularity; /**< Granularity of virtual memory. */ size_t total_vram; /**< Total video RAM available on the device. */ }; - cann_device_info devices[GGML_CANN_MAX_DEVICES] = - {}; /**< Array of CANN device information. */ + cann_device_info devices[GGML_CANN_MAX_DEVICES] = {}; /**< Array of CANN device information. */ }; -const ggml_cann_device_info& ggml_cann_info(); +const ggml_cann_device_info & ggml_cann_info(); -void ggml_cann_set_device(int32_t device); +void ggml_cann_set_device(int32_t device); int32_t ggml_cann_get_device(); -std::optional get_env(const std::string& name); -bool parse_bool(const std::string& value); -int parse_integer(const std::string& value); +std::optional get_env(const std::string & name); +bool parse_bool(const std::string & value); +int parse_integer(const std::string & value); /** * @brief Abstract base class for memory pools used by CANN. @@ -126,7 +124,7 @@ struct ggml_cann_pool { * will be stored. * @return Pointer to the allocated memory block. */ - virtual void* alloc(size_t size, size_t* actual_size) = 0; + virtual void * alloc(size_t size, size_t * actual_size) = 0; /** * @brief Frees a previously allocated memory block. @@ -136,16 +134,16 @@ struct ggml_cann_pool { * @note Note that all CANN opertors are running async. Make sure memory is * still avaiable before this operator finished. */ - virtual void free(void* ptr, size_t size) = 0; + virtual void free(void * ptr, size_t size) = 0; }; /** * @brief RAII wrapper for managing memory allocations from a CANN memory pool. */ struct ggml_cann_pool_alloc { - ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. 
*/ - void* ptr = nullptr; /**< Pointer to the allocated memory block. */ - size_t actual_size = 0; /**< Actual size of the allocated memory block. */ + ggml_cann_pool * pool = nullptr; /**< Pointer to the memory pool. */ + void * ptr = nullptr; /**< Pointer to the allocated memory block. */ + size_t actual_size = 0; /**< Actual size of the allocated memory block. */ /** * @brief Default constructor. @@ -156,16 +154,14 @@ struct ggml_cann_pool_alloc { * @brief Constructor that initializes the memory pool. * @param pool Reference to the memory pool. */ - explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {} + explicit ggml_cann_pool_alloc(ggml_cann_pool & pool) : pool(&pool) {} /** * @brief Constructor that initializes the memory pool and allocates memory. * @param pool Reference to the memory pool. * @param size Size of the memory block to allocate. */ - ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) { - alloc(size); - } + ggml_cann_pool_alloc(ggml_cann_pool & pool, size_t size) : pool(&pool) { alloc(size); } /** * @brief Destructor that frees the allocated memory block. @@ -181,7 +177,7 @@ struct ggml_cann_pool_alloc { * @param size Size of the memory block to allocate. * @return Pointer to the allocated memory block. */ - void* alloc(size_t size) { + void * alloc(size_t size) { GGML_ASSERT(pool != nullptr); GGML_ASSERT(ptr == nullptr); ptr = pool->alloc(size, &this->actual_size); @@ -194,7 +190,7 @@ struct ggml_cann_pool_alloc { * @param size Size of the memory block to allocate. * @return Pointer to the allocated memory block. */ - void* alloc(ggml_cann_pool& pool, size_t size) { + void * alloc(ggml_cann_pool & pool, size_t size) { this->pool = &pool; return alloc(size); } @@ -203,25 +199,25 @@ struct ggml_cann_pool_alloc { * @brief Gets the pointer to the allocated memory block. * @return Pointer to the allocated memory block. */ - void* get() { return ptr; } + void * get() { return ptr; } // Deleted copy constructor - ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete; + ggml_cann_pool_alloc(const ggml_cann_pool_alloc &) = delete; // Deleted move constructor - ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete; + ggml_cann_pool_alloc(ggml_cann_pool_alloc &&) = delete; // Deleted copy assignment operator - ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete; + ggml_cann_pool_alloc & operator=(const ggml_cann_pool_alloc &) = delete; // Deleted move assignment operator - ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete; + ggml_cann_pool_alloc & operator=(ggml_cann_pool_alloc &&) = delete; }; /** * @brief Function pointer type for ACLNN operator calls. */ -using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStream); +using aclnn_func_t = aclnnStatus (*)(void *, uint64_t, aclOpExecutor *, aclrtStream); /** * @brief Base class for all CANN tasks to be submitted to the task queue. @@ -229,7 +225,7 @@ using aclnn_func_t = aclnnStatus (*)(void*, uint64_t, aclOpExecutor*, aclrtStrea * Users should override the run_task() method with actual task logic. */ class cann_task { -public: + public: virtual void run_task() {} }; @@ -237,16 +233,20 @@ class cann_task { * @brief A lock-free ring-buffer based task queue for asynchronously executing cann_task instances. */ class cann_task_queue { -public: + public: /** * @brief Constructs a task queue with a fixed power-of-two capacity for a specific device. * * @param capacity Queue capacity. Must be a power of 2. 
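// Illustrative sketch (not part of the patch) of the RAII guard pattern used by
// ggml_cann_pool_alloc: the guard borrows a pool, allocates at most once, and returns the
// block in its destructor, so every early-exit path releases it. pool, heap_pool and
// pool_alloc are hypothetical stand-ins for the ggml_cann_pool types.
#include <cassert>
#include <cstddef>
#include <cstdlib>

struct pool {
    virtual ~pool() = default;
    virtual void * alloc(size_t size, size_t * actual_size) = 0;
    virtual void free(void * ptr, size_t size) = 0;
};

struct heap_pool : pool {
    void * alloc(size_t size, size_t * actual_size) override {
        *actual_size = size;
        return std::malloc(size);
    }
    void free(void * ptr, size_t) override { std::free(ptr); }
};

struct pool_alloc {
    pool * p = nullptr;
    void * ptr = nullptr;
    size_t actual_size = 0;

    explicit pool_alloc(pool & pool_ref) : p(&pool_ref) {}
    pool_alloc(pool & pool_ref, size_t size) : p(&pool_ref) { alloc(size); }
    ~pool_alloc() {
        if (ptr != nullptr) {
            p->free(ptr, actual_size);
        }
    }

    void * alloc(size_t size) {
        assert(p != nullptr && ptr == nullptr);  // single allocation per guard
        ptr = p->alloc(size, &actual_size);
        return ptr;
    }
    void * get() { return ptr; }

    pool_alloc(const pool_alloc &) = delete;
    pool_alloc & operator=(const pool_alloc &) = delete;
};

int main() {
    heap_pool hp;
    pool_alloc guard(hp, 64);
    assert(guard.get() != nullptr);
    // memory is returned to the pool when guard goes out of scope
}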
* @param device Target device ID (used for context setting). */ - explicit cann_task_queue(size_t capacity, int32_t device) - : buffer_(capacity), capacity_(capacity), head_(0), tail_(0), - running_(false), device_(device) { + explicit cann_task_queue(size_t capacity, int32_t device) : + buffer_(capacity), + capacity_(capacity), + head_(0), + tail_(0), + running_(false), + device_(device) { GGML_ASSERT((capacity & (capacity - 1)) == 0 && "capacity must be power of 2"); mask_ = capacity_ - 1; } @@ -257,7 +257,7 @@ class cann_task_queue { * @param item Unique pointer to the task. * @return true if the task was successfully enqueued, false if the queue was full. */ - bool enqueue(std::unique_ptr&& item) { + bool enqueue(std::unique_ptr && item) { size_t next_tail = (tail_ + 1) & mask_; if (next_tail == head_) { @@ -276,17 +276,16 @@ class cann_task_queue { * * @param task Task to be submitted. */ - void submit_task(std::unique_ptr&& task) { - while(!enqueue(std::move(task))) { + void submit_task(std::unique_ptr && task) { + while (!enqueue(std::move(task))) { std::this_thread::yield(); continue; } if (!running_) { running_ = true; - thread_ = std::thread(&cann_task_queue::execute, this); + thread_ = std::thread(&cann_task_queue::execute, this); } - } /** @@ -309,7 +308,7 @@ class cann_task_queue { } } -private: + private: /** * @brief Worker thread function that continuously dequeues and executes tasks. */ @@ -317,7 +316,7 @@ class cann_task_queue { ggml_cann_set_device(device_); while (running_) { - if(head_ == tail_) { + if (head_ == tail_) { std::this_thread::yield(); continue; } @@ -330,24 +329,24 @@ class cann_task_queue { } std::vector> buffer_; - const size_t capacity_; - size_t mask_; - size_t head_; - size_t tail_; - bool running_; - std::thread thread_; - int32_t device_; + const size_t capacity_; + size_t mask_; + size_t head_; + size_t tail_; + bool running_; + std::thread thread_; + int32_t device_; }; #ifdef USE_ACL_GRAPH struct ggml_graph_node_properties { // dst tensor - void * node_address; + void * node_address; int64_t ne[GGML_MAX_DIMS]; - size_t nb[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; // src tensor - void * src_address[GGML_MAX_SRC]; + void * src_address[GGML_MAX_SRC]; int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS]; size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS]; @@ -376,13 +375,11 @@ struct ggml_cann_graph { * move existing graphs to the front (most recently used), and clear the cache. */ struct ggml_cann_graph_lru_cache { - size_t capacity; /**< Maximum number of graphs in the cache. */ + size_t capacity; /**< Maximum number of graphs in the cache. */ - std::list cache_list; /**< List storing cached graphs as raw pointers. */ + std::list cache_list; /**< List storing cached graphs as raw pointers. */ - ggml_cann_graph_lru_cache() { - capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); - } + ggml_cann_graph_lru_cache() { capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12")); } /** * @brief Push a new graph to the front of the cache. @@ -390,11 +387,11 @@ struct ggml_cann_graph_lru_cache { * @param new_node Pointer to the new ggml_cann_graph to cache. * Ownership is transferred to the cache (cache will delete it). 
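// Illustrative sketch (not part of the patch) of the index arithmetic behind cann_task_queue:
// with a power-of-two capacity, "(index + 1) & (capacity - 1)" wraps without a modulo, and the
// queue is full when advancing the tail would land on the head. This standalone ring buffer
// stores ints instead of cann_task pointers and is single-producer/single-consumer only.
#include <cassert>
#include <cstddef>
#include <vector>

class ring_queue {
  public:
    explicit ring_queue(size_t capacity) : buffer_(capacity), mask_(capacity - 1) {
        assert((capacity & (capacity - 1)) == 0 && "capacity must be power of 2");
    }

    bool enqueue(int value) {
        size_t next_tail = (tail_ + 1) & mask_;
        if (next_tail == head_) {
            return false;  // full
        }
        buffer_[tail_] = value;
        tail_ = next_tail;
        return true;
    }

    bool dequeue(int & value) {
        if (head_ == tail_) {
            return false;  // empty
        }
        value = buffer_[head_];
        head_ = (head_ + 1) & mask_;
        return true;
    }

  private:
    std::vector<int> buffer_;
    size_t mask_;
    size_t head_ = 0;
    size_t tail_ = 0;
};

int main() {
    ring_queue q(8);
    for (int i = 0; i < 7; ++i) {
        assert(q.enqueue(i));  // capacity 8 holds at most 7 items, one slot stays empty
    }
    assert(!q.enqueue(7));
    int v = -1;
    assert(q.dequeue(v) && v == 0);
}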
*/ - void push(ggml_cann_graph* new_node) { + void push(ggml_cann_graph * new_node) { if (cache_list.size() >= capacity) { - ggml_cann_graph* old = cache_list.back(); + ggml_cann_graph * old = cache_list.back(); cache_list.pop_back(); - delete old; // free the old graph + delete old; // free the old graph } cache_list.push_front(new_node); } @@ -403,7 +400,7 @@ struct ggml_cann_graph_lru_cache { * @brief Move an existing graph to the front of the cache. * @param node Pointer to the ggml_cann_graph to move. */ - void move_to_front(ggml_cann_graph* node) { + void move_to_front(ggml_cann_graph * node) { cache_list.remove(node); cache_list.push_front(node); } @@ -421,92 +418,89 @@ struct ggml_cann_graph_lru_cache { /** * @brief Destructor that clears the cache and frees all cached graphs. */ - ~ggml_cann_graph_lru_cache() { - clear(); - } + ~ggml_cann_graph_lru_cache() { clear(); } }; #endif // USE_ACL_GRAPH struct ggml_cann_rope_cache { ~ggml_cann_rope_cache() { - if(theta_scale_cache != nullptr) { + if (theta_scale_cache != nullptr) { ACL_CHECK(aclrtFree(theta_scale_cache)); } - if(sin_cache != nullptr) { + if (sin_cache != nullptr) { ACL_CHECK(aclrtFree(sin_cache)); } - if(cos_cache != nullptr) { + if (cos_cache != nullptr) { ACL_CHECK(aclrtFree(cos_cache)); } } - void* theta_scale_cache = nullptr; + void * theta_scale_cache = nullptr; int64_t theta_scale_length = 0; // sin/cos cache, used only to accelerate first layer on each device - void* sin_cache = nullptr; - void* cos_cache = nullptr; - int64_t position_length = 0; + void * sin_cache = nullptr; + void * cos_cache = nullptr; + int64_t position_length = 0; // Properties to check before reusing the sincos cache - bool cached = false; - float ext_factor = 0.0f; - float theta_scale = 0.0f; - float freq_scale = 0.0f; - float attn_factor = 0.0f; - bool is_neox = false; + bool cached = false; + float ext_factor = 0.0f; + float theta_scale = 0.0f; + float freq_scale = 0.0f; + float attn_factor = 0.0f; + bool is_neox = false; }; struct ggml_cann_tensor_cache { ~ggml_cann_tensor_cache() { - if(cache != nullptr) { + if (cache != nullptr) { ACL_CHECK(aclrtFree(cache)); } } - void* cache = nullptr; - int64_t size = 0; + void * cache = nullptr; + int64_t size = 0; }; /** * @brief Context for managing CANN backend operations. */ struct ggml_backend_cann_context { - int32_t device; /**< Device ID. */ - std::string name; /**< Name of the device. */ - std::string description; /**< Description of the device. */ - aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ + int32_t device; /**< Device ID. */ + std::string name; /**< Name of the device. */ + std::string description; /**< Description of the device. */ + aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ #ifdef USE_ACL_GRAPH /// Cached CANN ACL graph used for executing the current ggml computation graph. ggml_cann_graph_lru_cache graph_lru_cache; - bool acl_graph_mode = true; + bool acl_graph_mode = true; #endif - cann_task_queue task_queue; - bool async_mode; + cann_task_queue task_queue; + bool async_mode; // Rope Cache - ggml_cann_rope_cache rope_cache; + ggml_cann_rope_cache rope_cache; // Constant Pool ggml_cann_tensor_cache rms_norm_one_tensor_cache; ggml_cann_tensor_cache rms_norm_zero_tensor_cache; - aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ + aclrtStream streams[GGML_CANN_MAX_STREAMS] = { nullptr }; /**< Array of streams for the device. 
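// Illustrative sketch (not part of the patch) of the list-based LRU policy used by
// ggml_cann_graph_lru_cache: new entries go to the front, reuse moves an entry back to the
// front, and when capacity is exceeded the least recently used entry at the back is evicted.
// Plain ints stand in for cached graphs; lru_cache is a hypothetical name for this sketch.
#include <cassert>
#include <cstddef>
#include <list>

struct lru_cache {
    size_t capacity = 3;
    std::list<int> cache_list;

    void push(int value) {
        if (cache_list.size() >= capacity) {
            cache_list.pop_back();  // evict the least recently used entry
        }
        cache_list.push_front(value);
    }

    void move_to_front(int value) {
        cache_list.remove(value);
        cache_list.push_front(value);
    }
};

int main() {
    lru_cache cache;
    cache.push(1);
    cache.push(2);
    cache.push(3);
    cache.move_to_front(1);  // 1 becomes most recently used
    cache.push(4);           // evicts 2, the current least recently used
    assert(cache.cache_list.front() == 4);
    assert(cache.cache_list.back() == 3);
}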
*/ /** * @brief Constructor for initializing the context with a given device. * @param device Device ID. */ - explicit ggml_backend_cann_context(int device) - : device(device), name("CANN" + std::to_string(device)), task_queue(1024, device) { + explicit ggml_backend_cann_context(int device) : + device(device), + name("CANN" + std::to_string(device)), + task_queue(1024, device) { ggml_cann_set_device(device); description = aclrtGetSocName(); async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or("")); - GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__, - device, async_mode ? "ON" : "OFF"); + GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__, device, async_mode ? "ON" : "OFF"); #ifdef USE_ACL_GRAPH acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on")); - GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n", - __func__, device, - acl_graph_mode ? "GRAPH" : "EAGER", - acl_graph_mode ? "acl graph enabled" : "acl graph disabled"); + GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n", __func__, device, acl_graph_mode ? "GRAPH" : "EAGER", + acl_graph_mode ? "acl graph enabled" : "acl graph disabled"); #endif } @@ -549,8 +543,7 @@ struct ggml_backend_cann_context { aclrtStream stream() { return stream(0); } // TODO: each stream should have a memory pool. - std::unique_ptr - mem_pool; /**< Memory pool for the device. */ + std::unique_ptr mem_pool; /**< Memory pool for the device. */ /** * @brief Create a new memory pool for a given device. @@ -563,7 +556,7 @@ struct ggml_backend_cann_context { * @brief Get or create the memory pool for the context. * @return Reference to the memory pool. */ - ggml_cann_pool& pool() { + ggml_cann_pool & pool() { if (mem_pool == nullptr) { mem_pool = new_pool_for_device(device); } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp old mode 100755 new mode 100644 index ad1adba6b3a8a..8bd5449f1f75f --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -56,14 +56,12 @@ * @param line The line number where the error occurred. * @param msg The error message. */ -[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, - const char* file, int line, const char* msg) { +[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { int32_t id = -1; aclrtGetDevice(&id); GGML_LOG_ERROR("CANN error: %s\n", msg); - GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, - file, line); + GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_LOG_ERROR(" %s\n", stmt); // abort with GGML_ASSERT to get a stack trace GGML_ABORT("CANN error"); @@ -79,7 +77,7 @@ void ggml_cann_set_device(const int32_t device) { aclrtGetDevice(¤t_device); if (device == current_device) { - return; + return; } ACL_CHECK(aclrtSetDevice(device)); } @@ -99,9 +97,11 @@ int32_t ggml_cann_get_device() { * @brief Get the value of the specified environment variable (name). 
* if not empty, return a std::string object */ -std::optional get_env(const std::string& name) { - const char* val = std::getenv(name.c_str()); - if (!val) return std::nullopt; +std::optional get_env(const std::string & name) { + const char * val = std::getenv(name.c_str()); + if (!val) { + return std::nullopt; + } std::string res = std::string(val); std::transform(res.begin(), res.end(), res.begin(), ::tolower); return res; @@ -110,8 +110,8 @@ std::optional get_env(const std::string& name) { /** * @brief Verify whether the environment variable is a valid value. */ -bool parse_bool(const std::string& value) { - std::unordered_set valid_values = {"on", "1", "yes", "y", "enable", "true"}; +bool parse_bool(const std::string & value) { + std::unordered_set valid_values = { "on", "1", "yes", "y", "enable", "true" }; return valid_values.find(value) != valid_values.end(); } @@ -125,7 +125,7 @@ bool parse_bool(const std::string& value) { * @param value The string to parse. * @return The parsed integer, or 0 if conversion fails. */ -int parse_integer(const std::string& value) { +int parse_integer(const std::string & value) { try { return std::stoi(value); } catch (...) { @@ -144,11 +144,10 @@ int parse_integer(const std::string& value) { static ggml_cann_device_info ggml_cann_init() { ggml_cann_device_info info = {}; - aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count); + aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count); if (err != ACL_SUCCESS) { - GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", - __func__, aclGetRecentErrMsg()); + GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg()); return info; } @@ -156,16 +155,15 @@ static ggml_cann_device_info ggml_cann_init() { for (int id = 0; id < info.device_count; ++id) { aclrtPhysicalMemProp prop = {}; - prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; - prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = ACL_HBM_MEM_HUGE; - prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = id; - prop.reserve = 0; - err = aclrtMemGetAllocationGranularity( - &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, - &info.devices[id].vmm_granularity); - info.devices[id].vmm = err == ACL_SUCCESS; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = id; + prop.reserve = 0; + err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, + &info.devices[id].vmm_granularity); + info.devices[id].vmm = err == ACL_SUCCESS; size_t free, total; ggml_backend_cann_get_device_memory(id, &free, &total); @@ -185,7 +183,7 @@ static ggml_cann_device_info ggml_cann_init() { * * @return A reference to the structure containing the device information. */ -const ggml_cann_device_info& ggml_cann_info() { +const ggml_cann_device_info & ggml_cann_info() { static ggml_cann_device_info info = ggml_cann_init(); return info; } @@ -205,7 +203,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool { /** * @brief The minimum free margin for a buffer. */ - static const size_t min_free_margin = 1ull << 20; // 1MB + static const size_t min_free_margin = 1ull << 20; // 1MB /** * @brief The alignment for buffer allocation. @@ -226,22 +224,18 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool { * @brief Structure representing a CANN buffer. 
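// Illustrative sketch (not part of the patch) showing how the helpers above compose:
// get_env lower-cases the value, parse_bool accepts a small set of truthy strings, and
// parse_integer falls back to 0 on failure. This mirrors how GGML_CANN_ASYNC_MODE and
// GGML_CANN_GRAPH_CACHE_CAPACITY are read; the main() harness is hypothetical.
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <optional>
#include <string>
#include <unordered_set>

static std::optional<std::string> get_env(const std::string & name) {
    const char * val = std::getenv(name.c_str());
    if (!val) {
        return std::nullopt;
    }
    std::string res = val;
    std::transform(res.begin(), res.end(), res.begin(), ::tolower);
    return res;
}

static bool parse_bool(const std::string & value) {
    static const std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
    return valid_values.count(value) > 0;
}

static int parse_integer(const std::string & value) {
    try {
        return std::stoi(value);
    } catch (...) {
        return 0;
    }
}

int main() {
    bool async    = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
    int  capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12"));
    std::printf("async=%d graph cache capacity=%d\n", async, capacity);
}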
*/ struct ggml_cann_buffer { - void* ptr = nullptr; ///< Pointer to the buffer. - size_t size = 0; ///< Size of the buffer. - std::chrono::steady_clock::time_point last_used; ///< Last used time. + void * ptr = nullptr; ///< Pointer to the buffer. + size_t size = 0; ///< Size of the buffer. + std::chrono::steady_clock::time_point last_used; ///< Last used time. - bool operator>(const ggml_cann_buffer& other) const { - return size > other.size; - } + bool operator>(const ggml_cann_buffer & other) const { return size > other.size; } }; /** * @brief Array of CANN buffers in the pool. */ - std::unordered_map buffer_pool; - std::priority_queue, - std::greater<>> free_buffers ; + std::unordered_map buffer_pool; + std::priority_queue, std::greater<>> free_buffers; /** * @brief Total size of all buffers in the pool. @@ -262,7 +256,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool { */ ~ggml_cann_pool_buf_prio() { ggml_cann_set_device(device); - for (auto& [b_ptr, b_size] : buffer_pool) { + for (auto & [b_ptr, b_size] : buffer_pool) { aclrtFree(b_ptr); pool_size -= b_size; } @@ -278,14 +272,14 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool { * the allocated buffer. * @return A pointer to the allocated buffer. */ - void* alloc(size_t size, size_t* actual_size) override { + void * alloc(size_t size, size_t * actual_size) override { size = GGML_PAD(size, alignment); if (size == 0) { size = alignment; } - void* ptr = nullptr; - auto now = std::chrono::steady_clock::now(); + void * ptr = nullptr; + auto now = std::chrono::steady_clock::now(); std::vector free_buffers_rest; free_buffers_rest.reserve(free_buffers.size()); @@ -298,24 +292,22 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool { const size_t margin = b.size - size; if (margin <= max_reuse_margin) { *actual_size = b.size; - ptr = b.ptr; + ptr = b.ptr; #ifdef DEBUG_CANN_MALLOC GGML_LOG_INFO( "cann pool[%d]: reused %p, " "pool_size = %5u MB, " "size = %5u MB, " "margin = %5u MB\n", - device, b.ptr, - (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), - (uint32_t)(GGML_PAD(size, 1048576) / 1048576), - (uint32_t)(GGML_PAD(margin, 1048576) / 1048576)); + device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t) (GGML_PAD(size, 1048576) / 1048576), + (uint32_t) (GGML_PAD(margin, 1048576) / 1048576)); #endif break; } } - bool should_clean = !disable_clean && - b.size > min_free_margin && + bool should_clean = !disable_clean && b.size > min_free_margin && std::chrono::duration_cast(now - b.last_used).count() > 100; if (should_clean) { // free the buffer if the size is needed to be freed @@ -327,20 +319,20 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool { "cann pool[%d]: clean %p, " "pool_size = %5u MB, " "size = %5u MB\n", - device, b.ptr, - (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), - (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576)); + device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576)); #endif continue; } free_buffers_rest.push_back(b); } - for (ggml_cann_buffer &b : free_buffers_rest) { + for (ggml_cann_buffer & b : free_buffers_rest) { free_buffers.push(std::move(b)); } #ifdef DEBUG_CANN_MALLOC - GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576)); + GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device, + (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576)); #endif if (ptr != nullptr) { return ptr; @@ -356,8 +348,8 @@ struct 
ggml_cann_pool_buf_prio : public ggml_cann_pool { "cann pool[%d]: allocate %p, " "pool_size = %5u MB, " "size = %5u MB\n", - device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), - (uint32_t)(GGML_PAD(size, 1048576) / 1048576)); + device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t) (GGML_PAD(size, 1048576) / 1048576)); #endif buffer_pool.emplace(ptr, size); return ptr; @@ -369,7 +361,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool { * @param ptr Pointer to the buffer to free. * @param size Size of the buffer to free. */ - void free(void* ptr, size_t size) override { + void free(void * ptr, size_t size) override { GGML_UNUSED(size); auto it = buffer_pool.find(ptr); if (it == buffer_pool.end()) { @@ -377,13 +369,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool { } auto now = std::chrono::steady_clock::now(); - free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now}); + free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now }); #ifdef DEBUG_CANN_MALLOC GGML_LOG_INFO( "cann pool[%d]: return %p, " "pool_size = %5u MB\n", - device, ptr, - (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576)); + device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576)); #endif } }; @@ -402,7 +393,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { /** * @brief The minimum free margin for a buffer. */ - static const size_t min_free_margin = 1ull << 20; // 1MB + static const size_t min_free_margin = 1ull << 20; // 1MB /** * @brief The alignment for buffer allocation. @@ -428,10 +419,10 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { * @brief Structure representing a CANN buffer. */ struct ggml_cann_buffer { - void* ptr = nullptr; ///< Pointer to the buffer memory. - size_t size = 0; ///< Size of the buffer. - bool used = false; ///< Whether the buffer is currently in use. - std::chrono::steady_clock::time_point last_used; ///< Last used time. + void * ptr = nullptr; ///< Pointer to the buffer memory. + size_t size = 0; ///< Size of the buffer. + bool used = false; ///< Whether the buffer is currently in use. + std::chrono::steady_clock::time_point last_used; ///< Last used time. }; /** @@ -459,7 +450,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { ~ggml_cann_pool_buf() { ggml_cann_set_device(device); for (int i = 0; i < MAX_BUFFERS; ++i) { - ggml_cann_buffer& b = buffer_pool[i]; + ggml_cann_buffer & b = buffer_pool[i]; if (b.ptr != nullptr) { aclrtFree(b.ptr); pool_size -= b.size; @@ -476,18 +467,18 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { * the allocated buffer. * @return A pointer to the allocated buffer. 
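The repeated GGML_PAD(size, alignment) calls in these pools round a request up to the 128-byte allocation alignment; a small sketch of that arithmetic (the bit trick assumes a power-of-two alignment, which 128 is), including the zero-size corner case the pools handle right after padding.

#include <cassert>
#include <cstddef>

// Round size up to the next multiple of a power-of-two alignment (what GGML_PAD is used for here).
static size_t pad_to_sketch(size_t size, size_t alignment) {
    assert(alignment != 0 && (alignment & (alignment - 1)) == 0);  // power of two
    return (size + alignment - 1) & ~(alignment - 1);
}

// pad_to_sketch(1, 128)   == 128
// pad_to_sketch(128, 128) == 128
// pad_to_sketch(0, 128)   == 0   -> which is why the pools bump a zero-byte request up to `alignment`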
*/ - void* alloc(size_t size, size_t* actual_size) override { + void * alloc(size_t size, size_t * actual_size) override { size = GGML_PAD(size, alignment); if (size == 0) { size = alignment; } - void* ptr = nullptr; - auto now = std::chrono::steady_clock::now(); + void * ptr = nullptr; + auto now = std::chrono::steady_clock::now(); int i = 0; for (; i < MAX_BUFFERS; ++i) { - ggml_cann_buffer& b = buffer_pool[i]; + ggml_cann_buffer & b = buffer_pool[i]; if (b.ptr == nullptr) { break; } @@ -499,25 +490,23 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { const size_t margin = b.size - size; if (margin <= max_reuse_margin) { *actual_size = b.size; - b.used = true; - ptr = b.ptr; + b.used = true; + ptr = b.ptr; #ifdef DEBUG_CANN_MALLOC GGML_LOG_INFO( "cann pool[%d]: reused %p, " "pool_size = %5u MB, " "size = %5u MB, " "margin = %5u MB\n", - device, b.ptr, - (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), - (uint32_t)(GGML_PAD(size, 1048576) / 1048576), - (uint32_t)(GGML_PAD(margin, 1048576) / 1048576)); + device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t) (GGML_PAD(size, 1048576) / 1048576), + (uint32_t) (GGML_PAD(margin, 1048576) / 1048576)); #endif break; } } - bool should_clean = !disable_clean && - b.size > min_free_margin && + bool should_clean = !disable_clean && b.size > min_free_margin && std::chrono::duration_cast(now - b.last_used).count() > 100; if (should_clean) { // free the buffer if the size is needed to be freed @@ -528,9 +517,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { "cann pool[%d]: clean %p, " "pool_size = %5u MB, " "size = %5u MB\n", - device, b.ptr, - (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), - (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576)); + device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576)); #endif b.ptr = nullptr; } @@ -541,13 +529,13 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { if (i < MAX_BUFFERS) { // allocate a new buffer if no buffer can be reused - ggml_cann_buffer& b = buffer_pool[i]; + ggml_cann_buffer & b = buffer_pool[i]; ggml_cann_set_device(device); ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST)); pool_size += size; *actual_size = size; - b.size = size; - b.used = true; + b.size = size; + b.used = true; if (i >= MAX_BUFFERS - 8) { GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device); } @@ -556,9 +544,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { "cann pool[%d]: allocate %p, " "pool_size = %5u MB, " "size = %5u MB\n", - device, b.ptr, - (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576), - (uint32_t)(GGML_PAD(b.size, 1048576) / 1048576)); + device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576), + (uint32_t) (GGML_PAD(b.size, 1048576) / 1048576)); #endif return b.ptr; } @@ -572,21 +559,20 @@ struct ggml_cann_pool_buf : public ggml_cann_pool { * @param ptr Pointer to the buffer to free. * @param size Size of the buffer to free. 
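Both pool variants apply the same per-buffer policy while scanning their free buffers in alloc(): a condensed sketch of that decision, assuming the idle threshold is 100 milliseconds, with max_reuse_margin left as a parameter and the disable_clean flag omitted for brevity.

#include <chrono>
#include <cstddef>

enum class pool_action { reuse, evict, keep };

// One free-list entry is either handed back out, released to the driver, or kept for later.
static pool_action classify_free_buffer(size_t requested, size_t buffer_size, size_t max_reuse_margin,
                                        std::chrono::steady_clock::time_point last_used,
                                        std::chrono::steady_clock::time_point now) {
    const size_t min_free_margin = 1ull << 20;  // 1 MiB, as in the pools above
    if (buffer_size >= requested && buffer_size - requested <= max_reuse_margin) {
        return pool_action::reuse;              // close enough in size: reuse it
    }
    const auto idle_ms = std::chrono::duration_cast<std::chrono::milliseconds>(now - last_used).count();
    if (buffer_size > min_free_margin && idle_ms > 100) {
        return pool_action::evict;              // big and cold: free it and shrink the pool
    }
    return pool_action::keep;                   // keep it in the free list
}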
*/ - void free(void* ptr, size_t size) override { + void free(void * ptr, size_t size) override { GGML_UNUSED(size); for (int i = 0; i < MAX_BUFFERS; ++i) { - ggml_cann_buffer& b = buffer_pool[i]; + ggml_cann_buffer & b = buffer_pool[i]; if (b.ptr != ptr) { continue; } - b.used = false; + b.used = false; b.last_used = std::chrono::steady_clock::now(); #ifdef DEBUG_CANN_MALLOC GGML_LOG_INFO( "cann pool[%d]: return %p, " "pool_size = %5u MB\n", - device, b.ptr, - (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576)); + device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576)); #endif return; } @@ -614,7 +600,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { /** * @brief Pointer to the start of the virtual memory pool. */ - void* pool_addr = 0; + void * pool_addr = 0; /** * @brief Amount of virtual memory used in the pool. @@ -639,7 +625,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { /** * @brief Offsets for the mapped memory regions. */ - std::vector map_offsets; + std::vector map_offsets; /** * @brief Constructor to initialize the buffer pool with virtual memory for @@ -647,11 +633,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { * * @param device The device ID to associate with this buffer pool. */ - explicit ggml_cann_pool_vmm(int device) - : device(device) { - auto dev = ggml_cann_info().devices[device]; + explicit ggml_cann_pool_vmm(int device) : device(device) { + auto dev = ggml_cann_info().devices[device]; granularity = dev.vmm_granularity; - max_size = dev.total_vram; + max_size = dev.total_vram; } /** @@ -659,10 +644,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { */ ~ggml_cann_pool_vmm() { if (pool_addr != 0) { - for (auto& offset : map_offsets) { + for (auto & offset : map_offsets) { ACL_CHECK(aclrtUnmapMem(offset)); } - for (auto& handle : handles) { + for (auto & handle : handles) { ACL_CHECK(aclrtFreePhysical(handle)); } ACL_CHECK(aclrtReleaseMemAddress(pool_addr)); @@ -677,11 +662,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { * the allocated buffer. * @return A pointer to the allocated buffer. 
*/ - void* alloc(size_t size, size_t* actual_size) override { + void * alloc(size_t size, size_t * actual_size) override { // round up the allocation size to the alignment to ensure that all // allocations are aligned for all data types const size_t alignment = 128; - size = GGML_PAD(size, alignment); + size = GGML_PAD(size, alignment); if (size == 0) { size = alignment; } @@ -691,53 +676,51 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { if (size > avail) { // round up to the next multiple of the granularity size_t reserve_size = size - avail; - reserve_size = GGML_PAD(reserve_size, granularity); + reserve_size = GGML_PAD(reserve_size, granularity); GGML_ASSERT(pool_size + reserve_size <= max_size); // allocate more physical memory aclrtPhysicalMemProp prop = {}; - prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; - prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; - prop.memAttr = ACL_HBM_MEM_HUGE; - prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; - prop.reserve = 0; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.reserve = 0; aclrtDrvMemHandle handle; ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0)); // reserve virtual address space (if not already reserved) if (pool_addr == 0) { - ACL_CHECK(aclrtReserveMemAddress( - &pool_addr, max_size, 0, NULL, 1)); + ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1)); } // map at the end of the pool - ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0, - handle, 0)); + ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0)); handles.push_back(handle); - map_offsets.push_back((char*)pool_addr + pool_size); + map_offsets.push_back((char *) pool_addr + pool_size); // add to the pool pool_size += reserve_size; #ifdef DEBUG_CANN_MALLOC - GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", - device, (unsigned long long) (pool_size/1024/1024), - (unsigned long long) (reserve_size/1024/1024)); + GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device, + (unsigned long long) (pool_size / 1024 / 1024), + (unsigned long long) (reserve_size / 1024 / 1024)); #endif } GGML_ASSERT(pool_addr != 0); - void* ptr = (void*)((char*)pool_addr + pool_used); + void * ptr = (void *) ((char *) pool_addr + pool_used); *actual_size = size; pool_used += size; #ifdef DEBUG_CANN_MALLOC - GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, - (unsigned long long)size, (unsigned long long)ptr); + GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, + (unsigned long long) ptr); #endif return ptr; } @@ -748,16 +731,16 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { * @param ptr Pointer to the buffer to free. * @param size Size of the buffer to free. 
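The virtual-memory pool above never frees individual blocks; it bumps an offset inside one large reserved address range and maps more physical memory when it runs out. A host-side sketch of just that arithmetic, with the ACL calls abstracted away (the 2 MiB granularity is an assumption for the sketch; the real value comes from aclrtMemGetAllocationGranularity).

#include <cassert>
#include <cstddef>

struct vmm_pool_sketch {
    size_t granularity = 2ull << 20;  // assumed; queried from the driver in the real pool
    size_t pool_size   = 0;           // bytes of physical memory mapped so far
    size_t pool_used   = 0;           // bytes currently handed out

    // Returns the byte offset into the reserved range; the real pool returns (char *) pool_addr + pool_used.
    size_t alloc(size_t size) {
        size_t avail = pool_size - pool_used;
        if (size > avail) {
            size_t reserve = size - avail;
            reserve = (reserve + granularity - 1) / granularity * granularity;  // round up to granularity
            // real pool: aclrtMallocPhysical + aclrtMapMem at offset pool_size
            pool_size += reserve;
        }
        size_t offset = pool_used;
        pool_used += size;
        return offset;
    }

    // Frees must come back in reverse allocation order, exactly as asserted in the real free().
    void free(size_t offset, size_t size) {
        pool_used -= size;
        assert(offset == pool_used);
    }
};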
*/ - void free(void* ptr, size_t size) override { + void free(void * ptr, size_t size) override { #ifdef DEBUG_CANN_MALLOC - GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, - (unsigned long long)size, (unsigned long long)ptr); + GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, + (unsigned long long) ptr); #endif pool_used -= size; // all deallocations must be in reverse order of the allocations - GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used)); + GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used)); } }; @@ -769,8 +752,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool { * @param device The device ID for which to create the pool. * @return A unique pointer to the created CANN pool. */ -std::unique_ptr ggml_backend_cann_context::new_pool_for_device( - int device) { +std::unique_ptr ggml_backend_cann_context::new_pool_for_device(int device) { std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or(""); if (mem_pool_type == "prio") { @@ -795,9 +777,8 @@ std::unique_ptr ggml_backend_cann_context::new_pool_for_device( * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID. */ struct ggml_backend_cann_buffer_context { - int32_t device; ///< The device ID associated with this buffer context. - void* dev_ptr = - nullptr; ///< Pointer to the device memory allocated for the buffer. + int32_t device; ///< The device ID associated with this buffer context. + void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer. /** * @brief Constructor to initialize the CANN buffer context. @@ -805,9 +786,7 @@ struct ggml_backend_cann_buffer_context { * @param device The device ID associated with this buffer context. * @param dev_ptr Pointer to the device memory allocated for the buffer. */ - ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) - : device(device), - dev_ptr(dev_ptr) {} + ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} /** * @brief Destructor to free the device memory allocated for the buffer. @@ -825,8 +804,8 @@ struct ggml_backend_cann_buffer_context { * @return true if the buffer is a CANN buffer, false otherwise. */ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft); -static bool ggml_backend_buffer_is_cann( - ggml_backend_buffer_t buffer) { + +static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) { return ggml_backend_buft_is_cann(buffer->buft); } @@ -838,10 +817,8 @@ static bool ggml_backend_buffer_is_cann( * * @param buffer The CANN buffer to free. */ -static void ggml_backend_cann_buffer_free_buffer( - ggml_backend_buffer_t buffer) { - ggml_backend_cann_buffer_context* ctx = - (ggml_backend_cann_buffer_context*)buffer->context; +static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context; delete ctx; } @@ -854,10 +831,8 @@ static void ggml_backend_cann_buffer_free_buffer( * @param buffer The CANN buffer whose base pointer is to be retrieved. * @return A pointer to the base of the device memory allocated for the buffer. 
*/ -static void* ggml_backend_cann_buffer_get_base( - ggml_backend_buffer_t buffer) { - ggml_backend_cann_buffer_context* ctx = - (ggml_backend_cann_buffer_context*)buffer->context; +static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context; return ctx->dev_ptr; } @@ -874,21 +849,17 @@ static void* ggml_backend_cann_buffer_get_base( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, - const void* src, - void* dst) { - - int64_t n_elems = ggml_nelements(tensor); - int64_t groups = n_elems / QK4_0; - size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; +static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; - uint8_t* quant_offset = (uint8_t*)dst; - uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + uint8_t * quant_offset = (uint8_t *) dst; + uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes); for (int i = 0; i < groups; i++) { - const block_q4_0* group = - (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0)); - *scale_offset = group->d; + const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0)); + *scale_offset = group->d; scale_offset++; // 0-15 @@ -907,8 +878,7 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, } // put (uint4b_t -8) into int4b_t - for (quant_offset = (uint8_t*)dst; - quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) { + for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) { (*quant_offset) ^= 0x88; } } @@ -926,29 +896,27 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where the Q4.0 formatted data * will be stored. 
*/ -static void ggml_backend_cann_transform_back_q4_0( - const ggml_tensor* tensor, void* src, void* dst) { +static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; - int64_t n_elems = ggml_nelements(tensor); - int64_t groups = n_elems / QK4_0; - size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + uint8_t * quant_offset = (uint8_t *) src; + uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes); - uint8_t* quant_offset = (uint8_t*)src; - uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes); - - for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) { + for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) { (*quant_offset) ^= 0x88; } - quant_offset = (uint8_t*)src; + quant_offset = (uint8_t *) src; for (int i = 0; i < groups; i++) { - block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0)); - group->d = *scale_offset; + block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0)); + group->d = *scale_offset; scale_offset++; // 0-15 for (int j = 0; j < QK4_0 / 2; j += 2) { - group->qs[j] = ((*quant_offset) & 0x0F); + group->qs[j] = ((*quant_offset) & 0x0F); group->qs[j + 1] = ((*quant_offset) >> 4); quant_offset++; } @@ -975,20 +943,17 @@ static void ggml_backend_cann_transform_back_q4_0( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, - const void* src, - void* dst) { - int64_t n_elems = ggml_nelements(tensor); - int64_t groups = n_elems / QK8_0; - size_t quant_bytes = n_elems * sizeof(uint8_t); +static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); - uint8_t* quant_offset = (uint8_t*)dst; - uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + uint8_t * quant_offset = (uint8_t *) dst; + uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes); for (int i = 0; i < groups; i++) { - const block_q8_0* group = - (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0)); - *scale_offset = group->d; + const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0)); + *scale_offset = group->d; scale_offset++; size_t group_quant_size = QK8_0 * sizeof(uint8_t); memcpy(quant_offset, group->qs, group_quant_size); @@ -1009,19 +974,17 @@ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where the Q8.0 formatted data * will be stored. 
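The forward transform above packs two 4-bit quants per byte and then XORs every byte with 0x88 ("put (uint4b_t - 8) into int4b_t"); the back transform undoes it with the same XOR. A self-contained check of why flipping the top bit of each nibble equals subtracting 8 modulo 16.

#include <cassert>
#include <cstdint>

int main() {
    // Per nibble: block_q4_0 stores q + 8 in [0, 15]; XOR with 0x8 flips the high bit,
    // which is the same as subtracting 8 modulo 16, i.e. the two's-complement int4 value.
    for (int q = 0; q < 16; ++q) {
        assert((q ^ 0x8) == ((q - 8) & 0xF));
    }
    // Two packed nibbles are converted at once by XOR with 0x88.
    uint8_t packed = (12 << 4) | 3;  // encodes the quants 12 - 8 = 4 and 3 - 8 = -5
    assert((uint8_t) (packed ^ 0x88) == (uint8_t) ((((12 - 8) & 0xF) << 4) | ((3 - 8) & 0xF)));
    return 0;
}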
*/ -static void ggml_backend_cann_transform_back_q8_0( - const ggml_tensor* tensor, const void* src, void* dst) { - int64_t n_elems = ggml_nelements(tensor); - int64_t groups = n_elems / QK8_0; - size_t quant_bytes = n_elems * sizeof(uint8_t); +static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); - const uint8_t* quant_offset = (const uint8_t*)src; - const uint16_t* scale_offset = - (const uint16_t*)((const char*)src + quant_bytes); + const uint8_t * quant_offset = (const uint8_t *) src; + const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes); for (int i = 0; i < groups; i++) { - block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0)); - group->d = *scale_offset; + block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0)); + group->d = *scale_offset; scale_offset++; size_t group_quant_size = QK8_0 * sizeof(uint8_t); memcpy(group->qs, quant_offset, group_quant_size); @@ -1041,8 +1004,7 @@ static void ggml_backend_cann_transform_back_q8_0( * @param dst Pointer to the destination buffer where transformed data will be * stored. */ -static void ggml_backend_cann_transform(ggml_tensor* tensor, - const void* src, void* dst) { +static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) { switch (tensor->type) { case GGML_TYPE_Q4_0: ggml_backend_cann_transform_q4_0(tensor, src, dst); @@ -1067,8 +1029,7 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor, * @param dst Pointer to the destination buffer where transformed tensor data * will be stored. */ -static void ggml_backend_cann_transform_back( - const ggml_tensor* tensor, void* src, void* dst) { +static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) { switch (tensor->type) { case GGML_TYPE_Q4_0: ggml_backend_cann_transform_back_q4_0(tensor, src, dst); @@ -1109,8 +1070,7 @@ static bool need_transform(ggml_type type) { * @param buffer The CANN buffer from which to initialize the tensor. * @param tensor Pointer to the tensor to be initialized. */ -static enum ggml_status ggml_backend_cann_buffer_init_tensor( - ggml_backend_buffer_t buffer, ggml_tensor* tensor) { +static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { if (tensor->view_src != NULL && tensor->view_offs == 0) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); return GGML_STATUS_SUCCESS; @@ -1121,13 +1081,11 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor( if (ggml_is_quantized(tensor->type)) { // Initialize padding to 0 to avoid possible NaN values size_t original_size = ggml_nbytes(tensor); - size_t padded_size = - ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); if (padded_size > original_size && tensor->view_src == nullptr) { size_t memset_size = padded_size - original_size; - ACL_CHECK(aclrtMemset((char*)tensor->data + original_size, - memset_size, 0, memset_size)); + ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size)); } } return GGML_STATUS_SUCCESS; @@ -1141,8 +1099,8 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor( * designed to be used with a global array, one per device. 
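Both transforms write the same device-side layout: all packed quants first, then one 16-bit scale per group, instead of the interleaved block layout ggml uses on the host. A small sketch of the two region sizes, assuming the usual ggml group sizes QK4_0 == QK8_0 == 32 and n_elems a multiple of the group size.

#include <cstddef>
#include <cstdint>

struct cann_quant_layout_sketch {
    size_t quant_bytes;  // bytes [0, quant_bytes)     : packed quantized values
    size_t scale_bytes;  // bytes [quant_bytes, total) : one uint16_t scale per group
};

static cann_quant_layout_sketch layout_q4_0(size_t n_elems) {
    return { n_elems / 2, n_elems / 32 * sizeof(uint16_t) };   // two 4-bit values per byte
}

static cann_quant_layout_sketch layout_q8_0(size_t n_elems) {
    return { n_elems, n_elems / 32 * sizeof(uint16_t) };       // one byte per value
}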
*/ struct ggml_cann_nz_workspace { - void* ptr; // Pointer to allocated device buffer - size_t allocated; // Size of currently allocated buffer in bytes + void * ptr; // Pointer to allocated device buffer + size_t allocated; // Size of currently allocated buffer in bytes /** * @brief Constructor. Initializes the workspace with no allocated memory. @@ -1158,7 +1116,7 @@ struct ggml_cann_nz_workspace { void clear() { if (ptr) { ACL_CHECK(aclrtFree(ptr)); - ptr = nullptr; + ptr = nullptr; allocated = 0; } } @@ -1185,7 +1143,7 @@ struct ggml_cann_nz_workspace { * * @return Pointer to the allocated buffer, or nullptr if not allocated. */ - void* get() const { return ptr; } + void * get() const { return ptr; } }; /** @@ -1207,19 +1165,17 @@ static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES]; * @note The workspace buffer used in this function is managed globally and reused * across calls. This reduces overhead from repeated memory allocation and deallocation. */ -static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) { - aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, - tensor->nb, 2, ACL_FORMAT_ND, offset); - uint64_t workspaceSize = 0; - aclOpExecutor *executor; +static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) { + aclTensor * weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset); + uint64_t workspaceSize = 0; + aclOpExecutor * executor; // TransMatmulWeight - ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, - &workspaceSize, &executor)); + ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor)); // Avoid frequent malloc/free of the workspace. g_nz_workspaces[device].realloc(workspaceSize); - void* g_nz_workspace = g_nz_workspaces[device].get(); + void * g_nz_workspace = g_nz_workspaces[device].get(); ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr)); ACL_CHECK(aclDestroyTensor(weightTransposed)); @@ -1238,11 +1194,12 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) * @param offset Offset in the source data from where to start copying. * @param size Size of the data to be copied, in bytes. */ -static void ggml_backend_cann_buffer_set_tensor( - ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, - size_t offset, size_t size) { - ggml_backend_cann_buffer_context *ctx = - (ggml_backend_cann_buffer_context *)buffer->context; +static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context; ggml_cann_set_device(ctx->device); // TODO: refer to cann(#6017), it use thread's default stream. @@ -1252,20 +1209,17 @@ static void ggml_backend_cann_buffer_set_tensor( // Only check env once. 
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on")); if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size, - ACL_MEMCPY_HOST_TO_DEVICE)); - if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) { + ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE)); + if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) { GGML_ASSERT(tensor->ne[2] == 1); GGML_ASSERT(tensor->ne[3] == 1); weight_format_to_nz(tensor, offset, ctx->device); } } else { - void *transform_buffer = malloc(size); + void * transform_buffer = malloc(size); ggml_backend_cann_transform(tensor, data, transform_buffer); - ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, - transform_buffer, size, - ACL_MEMCPY_HOST_TO_DEVICE)); + ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE)); free(transform_buffer); } } @@ -1283,22 +1237,20 @@ static void ggml_backend_cann_buffer_set_tensor( * @param offset Offset in the destination buffer where to start copying. * @param size Size of the data to be copied, in bytes. */ -static void ggml_backend_cann_buffer_get_tensor( - ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, - size_t offset, size_t size) { - ggml_backend_cann_buffer_context* ctx = - (ggml_backend_cann_buffer_context*)buffer->context; +static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context; ggml_cann_set_device(ctx->device); if (!need_transform(tensor->type)) { - ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size, - ACL_MEMCPY_DEVICE_TO_HOST)); + ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST)); } else { - void* transform_buffer = malloc(size); - ACL_CHECK(aclrtMemcpy(transform_buffer, size, - (char*)tensor->data + offset, size, - ACL_MEMCPY_DEVICE_TO_HOST)); + void * transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST)); ggml_backend_cann_transform_back(tensor, transform_buffer, data); free(transform_buffer); } @@ -1317,19 +1269,17 @@ static void ggml_backend_cann_buffer_get_tensor( * @param dst Pointer to the destination tensor where the data will be copied. * @return true if the copy operation succeeded, false otherwise. */ -static bool ggml_backend_cann_buffer_cpy_tensor( - ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) { +static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * src, + ggml_tensor * dst) { if (ggml_backend_buffer_is_cann(src->buffer)) { - ggml_backend_cann_buffer_context* src_ctx = - (ggml_backend_cann_buffer_context*)src->buffer->context; - ggml_backend_cann_buffer_context* dst_ctx = - (ggml_backend_cann_buffer_context*)buffer->context; + ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context; + ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context; size_t memcpy_size = ggml_nbytes(src); // Same device. 
if (src_ctx->device == dst_ctx->device) { - ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, - (const char*)src->data, memcpy_size, + ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE)); return true; } else { @@ -1339,13 +1289,11 @@ static bool ggml_backend_cann_buffer_cpy_tensor( #endif // Different device but can access by peer. int32_t canAccessPeer = 0; - ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, - dst_ctx->device)); + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device)); if (canAccessPeer) { ggml_cann_set_device(src_ctx->device); ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0)); - ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, - (const char*)src->data, memcpy_size, + ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE)); return true; } @@ -1363,10 +1311,8 @@ static bool ggml_backend_cann_buffer_cpy_tensor( * @param buffer The CANN buffer to be cleared. * @param value The value to which each byte in the buffer will be set. */ -static void ggml_backend_cann_buffer_clear( - ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_cann_buffer_context* ctx = - (ggml_backend_cann_buffer_context*)buffer->context; +static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context; ggml_cann_set_device(ctx->device); ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); @@ -1396,9 +1342,8 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { * buffer type. */ struct ggml_backend_cann_buffer_type_context { - int32_t - device; /**< Device identifier associated with the buffer context. */ - std::string name; /**< Name associated with the buffer context. */ + int32_t device; /**< Device identifier associated with the buffer context. */ + std::string name; /**< Name associated with the buffer context. */ }; /** @@ -1410,10 +1355,8 @@ struct ggml_backend_cann_buffer_type_context { * @param buft Pointer to the buffer type context. * @return Const pointer to the C-style string containing the name. */ -static const char* ggml_backend_cann_buffer_type_name( - ggml_backend_buffer_type_t buft) { - ggml_backend_cann_buffer_type_context* buft_ctx = - (ggml_backend_cann_buffer_type_context*)buft->context; +static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context; return buft_ctx->name.c_str(); } @@ -1428,34 +1371,27 @@ static const char* ggml_backend_cann_buffer_type_name( * @param size Size in bytes of the buffer to allocate. * @return Pointer to the allocated buffer, or nullptr if allocation fails. 
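The buffer-to-buffer copy above reduces to three outcomes: same device uses a direct device-to-device memcpy, different devices copy only when peer access is available (and never on 310p), and anything else is reported as unsupported so ggml falls back to a slower path. A sketch of that decision with the ACL calls abstracted away.

enum class cpy_path { same_device_d2d, peer_d2d, not_supported };

static cpy_path choose_cpy_path(int src_dev, int dst_dev, bool can_access_peer, bool is_310p) {
    if (src_dev == dst_dev) {
        return cpy_path::same_device_d2d;  // single aclrtMemcpy D2D
    }
    if (is_310p || !can_access_peer) {
        return cpy_path::not_supported;    // caller returns false, ggml falls back
    }
    return cpy_path::peer_d2d;             // enable peer access on the source device, then copy
}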
*/ -static ggml_backend_buffer_t -ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, - size_t size) { - ggml_backend_cann_buffer_type_context* buft_ctx = - (ggml_backend_cann_buffer_type_context*)buft->context; +static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context; ggml_cann_set_device(buft_ctx->device); const size_t alignment = 128; - size = GGML_PAD(size, alignment); + size = GGML_PAD(size, alignment); if (size == 0) { size = alignment; } - void* dev_ptr; + void * dev_ptr; aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); if (err != ACL_SUCCESS) { - GGML_LOG_ERROR( - "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", - __func__, size / 1024.0 / 1024.0, buft_ctx->device, - aclGetRecentErrMsg()); + GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__, + size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg()); return nullptr; } - ggml_backend_cann_buffer_context* ctx = - new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); + ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); - return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, - ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size); } /** @@ -1470,8 +1406,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, * @return The alignment requirement in bytes (fixed at 128 bytes for CANN * buffers). */ -static size_t ggml_backend_cann_buffer_type_get_alignment( - ggml_backend_buffer_type_t buft) { +static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; GGML_UNUSED(buft); @@ -1491,10 +1426,10 @@ static size_t ggml_backend_cann_buffer_type_get_alignment( * @return The total allocation size in bytes required for the tensor in the * CANN buffer. */ -static size_t ggml_backend_cann_buffer_type_get_alloc_size( - ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { - size_t size = ggml_nbytes(tensor); - int64_t ne0 = tensor->ne[0]; +static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, + const ggml_tensor * tensor) { + size_t size = ggml_nbytes(tensor); + int64_t ne0 = tensor->ne[0]; // Only check env once. static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on")); @@ -1507,19 +1442,17 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size( // size += (line_size_align_32 - line_size); if (ggml_is_quantized(tensor->type)) { if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size( - tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } - } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) { + } else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) { // NZ format weight are not support quantized yet. // If ND tensor transform to NZ, size may changed. 
- int64_t shape[] = {tensor->ne[1], tensor->ne[0]}; + int64_t shape[] = { tensor->ne[1], tensor->ne[0] }; GGML_ASSERT(tensor->ne[2] == 1); GGML_ASSERT(tensor->ne[3] == 1); - const aclIntArray *acl_shape = aclCreateIntArray(shape, 2); - size_t new_size; - ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, - ggml_cann_type_mapping(tensor->type), &new_size)); + const aclIntArray * acl_shape = aclCreateIntArray(shape, 2); + size_t new_size; + ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size)); ACL_CHECK(aclDestroyIntArray(acl_shape)); size = std::max(size, new_size); } @@ -1560,17 +1493,15 @@ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface * @return A pointer to the buffer type interface for the specified device, or * nullptr if the device index is out of range. */ -ggml_backend_buffer_type_t -ggml_backend_cann_buffer_type(int32_t device) { - static std::mutex mutex; +ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) { + static std::mutex mutex; std::lock_guard lock(mutex); if (device >= ggml_backend_cann_get_device_count()) { return nullptr; } - static ggml_backend_buffer_type - ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES]; + static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES]; static bool ggml_backend_cann_buffer_type_initialized = false; @@ -1580,8 +1511,7 @@ ggml_backend_cann_buffer_type(int32_t device) { /* .iface = */ ggml_backend_cann_buffer_type_interface, /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i), /* .context = */ - new ggml_backend_cann_buffer_type_context{ - i, "CANN" + std::to_string(i)}, + new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) }, }; } ggml_backend_cann_buffer_type_initialized = true; @@ -1645,16 +1575,16 @@ static void * ggml_cann_host_malloc(size_t size) { } const size_t alignment = 128; - size = GGML_PAD(size, alignment); + size = GGML_PAD(size, alignment); if (size == 0) { size = alignment; } - void * hostPtr = nullptr; - aclError err = aclrtMallocHost((void **) &hostPtr, size); + void * hostPtr = nullptr; + aclError err = aclrtMallocHost((void **) &hostPtr, size); if (err != ACL_SUCCESS) { - GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, - size / 1024.0 / 1024.0, aclGetRecentErrMsg()); + GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0, + aclGetRecentErrMsg()); return nullptr; } return hostPtr; @@ -1667,7 +1597,8 @@ static void * ggml_cann_host_malloc(size_t size) { * @param size Size in bytes of the host buffer to allocate. * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails. 
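The quantized branch of get_alloc_size above over-allocates so the last row can be read as a whole padded row. A worked sketch of that extra size for Q4_0, assuming MATRIX_ROW_PADDING is 512 as in other ggml backends and a Q4_0 block is 18 bytes for 32 elements.

#include <cassert>
#include <cstddef>
#include <cstdint>

static size_t q4_0_extra_alloc_bytes_sketch(int64_t ne0) {
    const int64_t padding    = 512;  // assumed MATRIX_ROW_PADDING
    const int64_t group_size = 32;   // QK4_0
    const size_t  block_size = 18;   // sizeof(block_q4_0): fp16 scale + 16 packed bytes
    if (ne0 % padding == 0) {
        return 0;
    }
    int64_t pad_elems = padding - ne0 % padding;             // elements needed to reach the next multiple
    assert(pad_elems % group_size == 0);                     // holds: ne0 and padding are both multiples of 32
    return (size_t) (pad_elems / group_size) * block_size;   // == ggml_row_size(GGML_TYPE_Q4_0, pad_elems)
}

// e.g. ne0 = 1056: pad by 480 elements = 15 extra Q4_0 blocks = 270 extra bytes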
*/ -static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { +static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { void * hostPtr = ggml_cann_host_malloc(size); if (hostPtr == nullptr) { @@ -1676,8 +1607,8 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm } ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free; + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free; return buffer; } @@ -1691,14 +1622,15 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = { /* .iface = */ { - /* .get_name = */ ggml_backend_cann_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_name = */ ggml_backend_cann_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0), + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ + ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0), /* .context = */ nullptr, }; @@ -1718,8 +1650,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { * stored. * @return true if the computation was successful; false otherwise. 
*/ -static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, - struct ggml_tensor* dst) { +static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) { switch (dst->op) { case GGML_OP_REPEAT: ggml_cann_repeat(ctx, dst); @@ -1765,14 +1696,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, case GGML_UNARY_OP_SILU: GGML_CANN_CALL_OP_UNARY(Silu); break; - case GGML_UNARY_OP_GELU_QUICK: { - auto lambda = [](ggml_backend_cann_context& ctx, - aclTensor* acl_src, - aclTensor* acl_dst) { - GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst); - }; - ggml_cann_op_unary(lambda, ctx, dst); - } break; + case GGML_UNARY_OP_GELU_QUICK: + { + auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { + GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst); + }; + ggml_cann_op_unary(lambda, ctx, dst); + } + break; case GGML_UNARY_OP_TANH: GGML_CANN_CALL_OP_UNARY(Tanh); break; @@ -1817,14 +1748,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, case GGML_GLU_OP_SWIGLU: GGML_CANN_CALL_OP_UNARY_GATED(Silu); break; - case GGML_GLU_OP_GEGLU_QUICK: { - auto lambda = [](ggml_backend_cann_context& ctx, - aclTensor* acl_src, - aclTensor* acl_dst) { - GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst); - }; - ggml_cann_op_unary_gated(lambda, ctx, dst); - } break; + case GGML_GLU_OP_GEGLU_QUICK: + { + auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) { + GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst); + }; + ggml_cann_op_unary_gated(lambda, ctx, dst); + } + break; default: return false; } @@ -1956,9 +1887,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, * @param backend Pointer to the CANN backend structure. * @return A pointer to a constant string representing the backend name. */ -static const char* ggml_backend_cann_name(ggml_backend_t backend) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; +static const char * ggml_backend_cann_name(ggml_backend_t backend) { + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context; return cann_ctx->name.c_str(); } @@ -1972,8 +1902,7 @@ static const char* ggml_backend_cann_name(ggml_backend_t backend) { * @param backend Pointer to the CANN backend structure to be freed. */ static void ggml_backend_cann_free(ggml_backend_t backend) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context; ACL_CHECK(aclrtSynchronizeDevice()); ACL_CHECK(aclrtResetDevice(cann_ctx->device)); @@ -1981,7 +1910,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) { delete backend; } - /** * @brief Sets tensor data asynchronously in the CANN backend. * @@ -1994,21 +1922,17 @@ static void ggml_backend_cann_free(ggml_backend_t backend) { * @param size Size of the data to copy in bytes. */ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, - ggml_tensor *tensor, - const void *data, - size_t offset, - size_t size) { - ggml_backend_cann_context *cann_ctx = - (ggml_backend_cann_context *)backend->context; - ggml_backend_buffer_t buf = - tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; - - GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && - "unsupported buffer type"); + ggml_tensor * tensor, + const void * data, + size_t offset, + size_t size) { + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context; + ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + + GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type"); GGML_ASSERT(!ggml_is_quantized(tensor->type)); - ggml_cann_async_memcpy(cann_ctx, (char *)tensor->data + offset, data, size, - ACL_MEMCPY_HOST_TO_DEVICE); + ggml_cann_async_memcpy(cann_ctx, (char *) tensor->data + offset, data, size, ACL_MEMCPY_HOST_TO_DEVICE); } /** @@ -2022,21 +1946,18 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, * @param offset Offset in bytes within the host data. * @param size Size of the data to copy in bytes. */ -static void ggml_backend_cann_get_tensor_async( - ggml_backend_t backend, const ggml_tensor *tensor, void *data, - size_t offset, size_t size) { - ggml_backend_cann_context *cann_ctx = - (ggml_backend_cann_context *)backend->context; - ggml_backend_buffer_t buf = - tensor->view_src ? tensor->view_src->buffer : tensor->buffer; +static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend, + const ggml_tensor * tensor, + void * data, + size_t offset, + size_t size) { + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context; + ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; - GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && - "unsupported buffer type"); + GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type"); GGML_ASSERT(!ggml_is_quantized(tensor->type)); - ggml_cann_async_memcpy(cann_ctx, data, (char *)tensor->data + offset, size, - ACL_MEMCPY_DEVICE_TO_HOST); - + ggml_cann_async_memcpy(cann_ctx, data, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST); } /** @@ -2052,28 +1973,23 @@ static void ggml_backend_cann_get_tensor_async( * @param dst Pointer to the destination tensor to copy data to. * @return true if the copy operation succeeds, false otherwise. */ -static bool ggml_backend_cann_cpy_tensor_async( - ggml_backend_t backend_src, ggml_backend_t backend_dst, - const ggml_tensor* src, ggml_tensor* dst) { - GGML_ASSERT(ggml_backend_is_cann(backend_src) || - ggml_backend_is_cann(backend_dst)); +static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src, + ggml_backend_t backend_dst, + const ggml_tensor * src, + ggml_tensor * dst) { + GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst)); - GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src)); + GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src)); - if (!ggml_backend_buffer_is_cann(src->buffer) || - !ggml_backend_buffer_is_cann(dst->buffer)) { + if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) { return false; } - ggml_backend_buffer_t buf_src = - src->view_src ? src->view_src->buffer : src->buffer; - ggml_backend_buffer_t buf_dst = - dst->view_src ? dst->view_src->buffer : dst->buffer; + ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; + ggml_backend_buffer_t buf_dst = dst->view_src ? 
dst->view_src->buffer : dst->buffer; - ggml_backend_cann_context* cann_ctx_src = - (ggml_backend_cann_context*)backend_src->context; - ggml_backend_cann_context* cann_ctx_dst = - (ggml_backend_cann_context*)backend_dst->context; + ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context; + ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context; size_t copy_size = ggml_nbytes(dst); if (copy_size == 0) { @@ -2084,17 +2000,14 @@ static bool ggml_backend_cann_cpy_tensor_async( // TODO: Support 310p P2P copy return false; #endif - ggml_backend_cann_buffer_context* buf_ctx_src = - (ggml_backend_cann_buffer_context*)buf_src->context; - ggml_backend_cann_buffer_context* buf_ctx_dst = - (ggml_backend_cann_buffer_context*)buf_dst->context; + ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context; + ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context; GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device); GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device); int32_t canAccessPeer = 0; - ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, - cann_ctx_dst->device)); + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device)); if (!canAccessPeer) { return false; } @@ -2106,8 +2019,7 @@ static bool ggml_backend_cann_cpy_tensor_async( // wait for task_queue empty to keep task order. cann_ctx_src->task_queue.wait(); - ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, - ACL_MEMCPY_DEVICE_TO_DEVICE, + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, cann_ctx_src->stream())); // record event on src stream after the copy // TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream @@ -2122,8 +2034,7 @@ static bool ggml_backend_cann_cpy_tensor_async( ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream())); } else { // src and dst are on the same backend - ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, - ACL_MEMCPY_DEVICE_TO_DEVICE, + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, cann_ctx_dst->stream())); } @@ -2139,8 +2050,7 @@ static bool ggml_backend_cann_cpy_tensor_async( * @param backend Pointer to the CANN backend structure to synchronize. */ static void ggml_backend_cann_synchronize(ggml_backend_t backend) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context; cann_ctx->task_queue.wait(); ggml_cann_set_device(cann_ctx->device); ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); @@ -2168,16 +2078,14 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) { * @param cann_ctx The CANN backend context containing the graph cache. * @param cgraph The current ggml computation graph. */ -static void add_lru_matched_graph_node_properties( - ggml_backend_cann_context * cann_ctx, - ggml_cgraph * cgraph) { +static void add_lru_matched_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { // Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache). 
ggml_cann_graph * new_graph = new ggml_cann_graph(); new_graph->ggml_graph_properties.resize(cgraph->n_nodes); for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) { ggml_tensor * node = cgraph->nodes[node_idx]; - auto & prop = new_graph->ggml_graph_properties[node_idx]; + auto & prop = new_graph->ggml_graph_properties[node_idx]; prop.node_address = node->data; prop.node_op = node->op; @@ -2214,11 +2122,9 @@ static void add_lru_matched_graph_node_properties( * @param graph_node_properties The stored properties of a CANN graph node. * @return true if all fields match (excluding GGML_OP_VIEW); false otherwise. */ -static bool ggml_graph_node_has_matching_properties( - ggml_tensor * node, - ggml_graph_node_properties * graph_node_properties) { - if (node->data != graph_node_properties->node_address && - node->op != GGML_OP_VIEW) { +static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, + ggml_graph_node_properties * graph_node_properties) { + if (node->data != graph_node_properties->node_address && node->op != GGML_OP_VIEW) { return false; } @@ -2237,8 +2143,7 @@ static bool ggml_graph_node_has_matching_properties( for (int i = 0; i < GGML_MAX_SRC; i++) { if (node->src[i]) { - if (node->src[i]->data != graph_node_properties->src_address[i] && - node->op != GGML_OP_VIEW) { + if (node->src[i]->data != graph_node_properties->src_address[i] && node->op != GGML_OP_VIEW) { return false; } @@ -2280,8 +2185,8 @@ static bool ggml_graph_node_has_matching_properties( * @return true if a matching cached graph exists; false otherwise. */ static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) { - ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache; - for (auto &graph_ptr : lru_cache.cache_list) { + ggml_cann_graph_lru_cache & lru_cache = cann_ctx->graph_lru_cache; + for (auto & graph_ptr : lru_cache.cache_list) { // Skip graphs with a different number of nodes. if (graph_ptr->ggml_graph_properties.size() != static_cast(cgraph->n_nodes)) { continue; @@ -2320,21 +2225,24 @@ static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * * @param use_cann_graph Whether to use CANN graph execution. * @param cann_graph_update_required Whether graph capture is needed due to graph changes. */ -static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph, - bool & use_cann_graph, bool & cann_graph_update_required) { +static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, + ggml_cgraph * cgraph, + bool & use_cann_graph, + bool & cann_graph_update_required) { #ifdef USE_ACL_GRAPH - ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); + ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front(); if (use_cann_graph && cann_graph_update_required) { ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL)); } -#endif // USE_ACL_GRAPH +#endif // USE_ACL_GRAPH // Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph. // With the use of CANN graphs, the execution will be performed by the graph launch. 
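The LRU cache decides whether a captured ACL graph can be replayed by fingerprinting every node and comparing the fingerprints on the next call. A condensed sketch of that comparison; the array sizes assume the usual ggml limits (4 dims, 10 sources, 64 bytes of op params), and, as in the patch, address checks are skipped for GGML_OP_VIEW nodes since views may alias.

#include <array>
#include <cstdint>
#include <cstring>

struct node_fingerprint_sketch {
    void *                 node_address;   // tensor->data
    int                    node_op;        // tensor->op
    std::array<int64_t, 4> ne, nb;         // shape and strides (GGML_MAX_DIMS)
    std::array<void *, 10> src_address;    // source data pointers (GGML_MAX_SRC)
    int32_t                op_params[16];  // GGML_MAX_OP_PARAMS / sizeof(int32_t)
};

// Replay is only allowed when every recorded field still matches the current graph.
static bool fingerprints_match(const node_fingerprint_sketch & a, const node_fingerprint_sketch & b, bool is_view) {
    if (!is_view && a.node_address != b.node_address) {
        return false;                      // data moved -> must re-capture
    }
    if (a.node_op != b.node_op || a.ne != b.ne || a.nb != b.nb) {
        return false;                      // op or shape changed
    }
    if (!is_view && a.src_address != b.src_address) {
        return false;                      // an input buffer moved
    }
    return std::memcmp(a.op_params, b.op_params, sizeof(a.op_params)) == 0;
}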
if (!use_cann_graph || cann_graph_update_required) { for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || + node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } @@ -2347,7 +2255,7 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx } #ifdef USE_ACL_GRAPH - if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture + if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph)); } @@ -2355,10 +2263,9 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx // Execute graph ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream())); } -#endif // USE_ACL_GRAPH +#endif // USE_ACL_GRAPH } - /** * @brief Computes a computational graph using a CANN backend. * @@ -2371,10 +2278,8 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation * completes successfully, otherwise an appropriate error status. */ -static enum ggml_status ggml_backend_cann_graph_compute( - ggml_backend_t backend, ggml_cgraph* cgraph) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; +static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context; ggml_cann_set_device(cann_ctx->device); g_nz_workspaces[cann_ctx->device].clear(); @@ -2382,7 +2287,7 @@ static enum ggml_status ggml_backend_cann_graph_compute( cann_ctx->rope_cache.cached = false; #ifdef USE_ACL_GRAPH - bool use_cann_graph = true; + bool use_cann_graph = true; bool cann_graph_update_required = false; static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or("")); @@ -2413,15 +2318,10 @@ static enum ggml_status ggml_backend_cann_graph_compute( } } #else - bool use_cann_graph = false; + bool use_cann_graph = false; bool cann_graph_update_required = false; #endif // USE_ACL_GRAPH - evaluate_and_capture_cann_graph( - cann_ctx, - cgraph, - use_cann_graph, - cann_graph_update_required - ); + evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required); return GGML_STATUS_SUCCESS; } @@ -2438,8 +2338,7 @@ static enum ggml_status ggml_backend_cann_graph_compute( * @return bool Returns true if the operation is supported by the backend, * otherwise false. 
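evaluate_and_capture_cann_graph above reduces to three modes per compute call: eager (graphs disabled, run every node directly), capture (graphs enabled but the cached graph no longer matches: run the nodes between aclmdlRICaptureBegin/End, then launch the new graph), and replay (fingerprints match: just launch the cached graph). A sketch of that dispatch, using the two booleans computed earlier in graph_compute.

enum class exec_mode_sketch { eager, capture_then_launch, replay };

static exec_mode_sketch choose_exec_mode(bool use_cann_graph, bool update_required) {
    if (!use_cann_graph) {
        return exec_mode_sketch::eager;
    }
    return update_required ? exec_mode_sketch::capture_then_launch : exec_mode_sketch::replay;
}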
*/ -static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, - const ggml_tensor* op) { +static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -2474,24 +2373,24 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, return false; } break; - case GGML_OP_MUL_MAT: { - switch (op->src[0]->type) { - case GGML_TYPE_F16: - case GGML_TYPE_F32: - return true; - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: + case GGML_OP_MUL_MAT: + { + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return true; + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: #ifdef ASCEND_310P - // Q4 && Q8 per group is not support on 310p device - return false; + // Q4 && Q8 per group is not support on 310p device + return false; #endif - // only support contiguous for quantized types. - return ggml_is_contiguous(op->src[0]) && - ggml_is_contiguous(op->src[1]); - default: - return false; + // only support contiguous for quantized types. + return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); + default: + return false; + } } - } case GGML_OP_MUL_MAT_ID: switch (op->src[0]->type) { case GGML_TYPE_F16: @@ -2504,99 +2403,107 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, return false; #endif // only support contiguous for quantized types. - return ggml_is_contiguous(op->src[0]) && - ggml_is_contiguous(op->src[1]); + return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); default: return false; } // embedding - case GGML_OP_GET_ROWS: { - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q8_0: - return true; - default: - return false; - } - } break; - case GGML_OP_SET_ROWS: { - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - return true; - default: - return false; + case GGML_OP_GET_ROWS: + { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } } - } break; - case GGML_OP_CPY: { - ggml_tensor *src = op->src[0]; - if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) || - (src->type != GGML_TYPE_F32 && - src->type != GGML_TYPE_F16)) { - // only support F32 and F16. - return false; + break; + case GGML_OP_SET_ROWS: + { + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + return true; + default: + return false; + } } - return true; - } break; - case GGML_OP_CONT: { - // TODO: support GGML_TYPE_BF16 - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - return true; - default: + break; + case GGML_OP_CPY: + { + ggml_tensor * src = op->src[0]; + if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) || + (src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) { + // only support F32 and F16. 
return false; + } + return true; } - } - case GGML_OP_ROPE: { - // TODO: with ops-test v == 1 - // TODO: n_dims <= ne0 - if (op->src[0]->ne[0] != op->op_params[1]) { - return false; + break; + case GGML_OP_CONT: + { + // TODO: support GGML_TYPE_BF16 + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + return true; + default: + return false; + } } + case GGML_OP_ROPE: + { + // TODO: with ops-test v == 1 + // TODO: n_dims <= ne0 + if (op->src[0]->ne[0] != op->op_params[1]) { + return false; + } - const int mode = ((const int32_t *) op->op_params)[2]; - if (mode & GGML_ROPE_TYPE_MROPE) { - return false; - } - if (mode & GGML_ROPE_TYPE_VISION) { - return false; - } + const int mode = ((const int32_t *) op->op_params)[2]; + if (mode & GGML_ROPE_TYPE_MROPE) { + return false; + } + if (mode & GGML_ROPE_TYPE_VISION) { + return false; + } #ifdef ASCEND_310P - if(!ggml_is_contiguous(op->src[0])){ - return false; - } + if (!ggml_is_contiguous(op->src[0])) { + return false; + } #endif - return true; - } - case GGML_OP_UPSCALE: { - // aclnnUpsampleNearest2dGetWorkspaceSize not support - // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal - if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) { - return false; + return true; } - if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) { - return false; + case GGML_OP_UPSCALE: + { + // aclnnUpsampleNearest2dGetWorkspaceSize not support + // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal + if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) { + return false; + } + if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) { + return false; + } + return true; } - return true; - } - case GGML_OP_POOL_2D: { - const int32_t * opts = (const int32_t *) op->op_params; + case GGML_OP_POOL_2D: + { + const int32_t * opts = (const int32_t *) op->op_params; #ifdef ASCEND_310P - enum ggml_op_pool opt = static_cast(opts[0]); - if(opt == GGML_OP_POOL_MAX){ - return false; - } + enum ggml_op_pool opt = static_cast(opts[0]); + if (opt == GGML_OP_POOL_MAX) { + return false; + } #endif - const int k0 = opts[1]; - const int k1 = opts[2]; - const int p0 = opts[5]; - const int p1 = opts[6]; - // value of paddingH should be at most half of kernelH - // value of paddingW should be at most half of kernelW - return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2)); - } + const int k0 = opts[1]; + const int k1 = opts[2]; + const int p0 = opts[5]; + const int p1 = opts[6]; + // value of paddingH should be at most half of kernelH + // value of paddingW should be at most half of kernelW + return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2)); + } case GGML_OP_DUP: case GGML_OP_SUM: case GGML_OP_IM2COL: @@ -2639,48 +2546,50 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, return (op->src[0]->ne[0] - 1) <= 255; case GGML_OP_SCALE: float bias; - memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float)); - return bias == 0.0f; // TODO: support bias != 0.0f + memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float)); + return bias == 0.0f; // TODO: support bias != 0.0f case GGML_OP_SOFT_MAX: // TODO: support attention sinks [TAG_ATTN_SINKS] if (op->src[2]) { return false; } return true; - case GGML_OP_FLASH_ATTN_EXT:{ + case GGML_OP_FLASH_ATTN_EXT: + { #ifdef ASCEND_310P - // FA not support on 310p device - return false; -#endif - // derived from [ggml-cuda.cu] - if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){ - return false; - } - if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != 
GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){ - return false; - } - if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){ - return false; - } - // TODO: support attention sinks [TAG_ATTN_SINKS] - if (op->src[4]) { - return false; - } - if (op->src[1]->ne[0] != op->src[2]->ne[0]) { - // different head sizes of K and V are not supported yet - return false; - } - if (op->src[0]->ne[0] % 16 != 0) { - // TODO: padding to support - return false; - } - float logitSoftcap = 0.0f; - memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float)); - if(logitSoftcap != 0.0f) { + // FA not support on 310p device return false; +#endif + // derived from [ggml-cuda.cu] + if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) { + return false; + } + if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && + op->src[1]->type != GGML_TYPE_BF16) { + return false; + } + if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) { + return false; + } + // TODO: support attention sinks [TAG_ATTN_SINKS] + if (op->src[4]) { + return false; + } + if (op->src[1]->ne[0] != op->src[2]->ne[0]) { + // different head sizes of K and V are not supported yet + return false; + } + if (op->src[0]->ne[0] % 16 != 0) { + // TODO: padding to support + return false; + } + float logitSoftcap = 0.0f; + memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float)); + if (logitSoftcap != 0.0f) { + return false; + } + return true; } - return true; - } default: return false; } @@ -2717,8 +2626,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { * @return bool Returns true if the operation should be offloaded, otherwise * false. */ -static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, - const ggml_tensor* op) { +static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { const int min_batch_size = 32; GGML_UNUSED(dev); @@ -2734,9 +2642,8 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, * @param event Pointer to the event structure to be recorded. */ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; - ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream())); + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context; + ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream())); } /** @@ -2749,13 +2656,10 @@ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_ * @param event Pointer to the event structure that the backend needs to wait * for. */ -static void ggml_backend_cann_event_wait(ggml_backend_t backend, - ggml_backend_event_t event) { - ggml_backend_cann_context* cann_ctx = - (ggml_backend_cann_context*)backend->context; +static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context; if (ggml_backend_is_cann(backend)) { - ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), - (aclrtEvent)event->context)); + ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context)); } else { GGML_ABORT("fatal error"); } @@ -2794,30 +2698,30 @@ static const ggml_backend_i ggml_backend_cann_interface = { * @return A pointer to the static GUID. 
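Several of the support checks above read op->op_params at fixed offsets, which is easy to lose track of in the reflowed hunks. The sketch below restates those reads with the same offsets as the code above (GGML_OP_POOL_2D kernel/padding, GGML_OP_FLASH_ATTN_EXT logit softcap, GGML_OP_SCALE bias); helper names are illustrative only.

// --- sketch, not part of the patch: how the op_params used above are unpacked ---
static bool pool_2d_padding_ok(const ggml_tensor * op) {
    const int32_t * opts = (const int32_t *) op->op_params;
    const int k0 = opts[1], k1 = opts[2];  // kernel height/width
    const int p0 = opts[5], p1 = opts[6];  // padding height/width
    // padding may be at most half the kernel size in each direction
    return (p0 <= k0 / 2) && (p1 <= k1 / 2);
}

static bool flash_attn_softcap_ok(const ggml_tensor * op) {
    float logit_softcap = 0.0f;
    memcpy(&logit_softcap, (const float *) op->op_params + 2, sizeof(float));
    return logit_softcap == 0.0f;  // logit soft-capping is not supported yet
}

static bool scale_bias_ok(const ggml_tensor * op) {
    float bias = 0.0f;
    memcpy(&bias, (const float *) op->op_params + 1, sizeof(float));
    return bias == 0.0f;  // TODO in the code above: support bias != 0.0f
}
// --- end sketch ---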
*/ static ggml_guid_t ggml_backend_cann_guid() { - static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34, - 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64}; + static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34, + 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 }; return &guid; } // backend device struct ggml_backend_cann_device_context { - int device; + int device; std::string name; std::string description; }; static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context; return ctx->name.c_str(); } -static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; +static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context; return ctx->description.c_str(); } static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context; ggml_backend_cann_get_device_memory(ctx->device, free, total); } @@ -2844,7 +2748,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) { GGML_UNUSED(params); - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context; return ggml_backend_cann_init(ctx->device); } @@ -2861,19 +2765,17 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons * @return bool Returns true if the CANN backend supports the buffer type, * otherwise false. */ -static bool ggml_backend_cann_supports_buft( - ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { +static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (ggml_backend_buft_is_cann(buft)) { - ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; - ggml_backend_cann_buffer_type_context * buft_ctx = - (ggml_backend_cann_buffer_type_context *)buft->context; + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context; + ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context; return buft_ctx->device == dev_ctx->device; } return false; } static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context; return ggml_backend_cann_buffer_type(ctx->device); } @@ -2892,9 +2794,8 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type( * @param backend Pointer to the CANN backend. * @return ggml_backend_event_t Returns a pointer to the new event structure. 
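The event hooks reformatted in this region map one-to-one onto ACL runtime calls. Only record, wait, synchronize and destroy appear in these hunks, so the creation call below (aclrtCreateEvent) is an assumption standing in for the elided event_new body:

// --- sketch, not part of the patch: the ACL event pattern behind the backend's event hooks ---
static void cann_fence_streams(aclrtStream producer, aclrtStream consumer) {
    aclrtEvent evt = nullptr;
    ACL_CHECK(aclrtCreateEvent(&evt));               // assumption: counterpart of device_event_new
    ACL_CHECK(aclrtRecordEvent(evt, producer));      // ggml_backend_cann_event_record
    ACL_CHECK(aclrtStreamWaitEvent(consumer, evt));  // ggml_backend_cann_event_wait (device-side wait)
    ACL_CHECK(aclrtSynchronizeEvent(evt));           // ..._device_event_synchronize (host-side wait)
    ACL_CHECK(aclrtDestroyEvent(evt));               // ..._device_event_free
}
// --- end sketch ---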
*/ -static ggml_backend_event_t ggml_backend_cann_device_event_new( - ggml_backend_dev_t dev) { - ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; +static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context; ggml_cann_set_device(dev_ctx->device); @@ -2916,7 +2817,7 @@ static ggml_backend_event_t ggml_backend_cann_device_event_new( * @param event Pointer to the event structure to be freed. */ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { - ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context)); + ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context)); delete event; GGML_UNUSED(dev); @@ -2930,7 +2831,7 @@ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_bac * @param event Pointer to the event structure to be synchronized. */ static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { - ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context)); + ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context)); GGML_UNUSED(dev); } @@ -2941,10 +2842,10 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = { /* .get_memory = */ ggml_backend_cann_device_get_memory, /* .get_type = */ ggml_backend_cann_device_get_type, /* .get_props = */ ggml_backend_cann_device_get_props, - /* .init_backend = */ ggml_backend_cann_device_init, // called for every card + /* .init_backend = */ ggml_backend_cann_device_init, // called for every card /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type, /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type, - /* .buffer_from_host_ptr = */ NULL, // not supported for CANN + /* .buffer_from_host_ptr = */ NULL, // not supported for CANN /* .supports_op = */ ggml_backend_cann_supports_op, /* .supports_buft = */ ggml_backend_cann_supports_buft, /* .offload_op = */ ggml_backend_cann_offload_op, @@ -2953,7 +2854,6 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = { /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize, }; - // backend reg struct ggml_backend_cann_reg_context { std::vector devices; @@ -2965,12 +2865,12 @@ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) { } static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) { - ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context; + ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context; return ctx->devices.size(); } static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) { - ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context; + ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context; GGML_ASSERT(index < ctx->devices.size()); return ctx->devices[index]; } @@ -2992,34 +2892,30 @@ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = { // backend registry, called only once for cann backend ggml_backend_reg_t ggml_backend_cann_reg() { static ggml_backend_reg reg; - static bool initialized = false; + static bool initialized = false; { - static std::mutex mutex; + static std::mutex mutex; std::lock_guard lock(mutex); if (!initialized) { aclInit(nullptr); ggml_backend_cann_reg_context * ctx = new 
ggml_backend_cann_reg_context; for (int i = 0; i < ggml_cann_info().device_count; i++) { - ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context(); - dev_ctx->description = aclrtGetSocName(); - dev_ctx->device = i; - dev_ctx->name = GGML_CANN_NAME + std::to_string(i); + ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context(); + dev_ctx->description = aclrtGetSocName(); + dev_ctx->device = i; + dev_ctx->name = GGML_CANN_NAME + std::to_string(i); ggml_cann_set_device(i); - ggml_backend_dev_t dev = new ggml_backend_device { - /* .iface = */ ggml_backend_cann_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx - }; + ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } - reg = ggml_backend_reg { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_cann_reg_interface, - /* .context = */ ctx - }; + reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cann_reg_interface, + /* .context = */ ctx }; } initialized = true; @@ -3035,39 +2931,36 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) { return nullptr; } - ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); + ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device); if (ctx == nullptr) { GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return nullptr; } ggml_cann_set_device(ctx->device); ggml_backend_t cann_backend = - new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), - /* .interface = */ ggml_backend_cann_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), - /* .context = */ ctx}; + new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(), + /* .interface = */ ggml_backend_cann_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), + /* .context = */ ctx }; return cann_backend; } bool ggml_backend_is_cann(ggml_backend_t backend) { - return backend != NULL && - ggml_guid_matches(backend->guid, ggml_backend_cann_guid()); + return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid()); } int32_t ggml_backend_cann_get_device_count() { return ggml_cann_info().device_count; } -void ggml_backend_cann_get_device_description( - int32_t device, char* description, size_t description_size) { +void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) { ggml_cann_set_device(device); - const char* soc_name = aclrtGetSocName(); + const char * soc_name = aclrtGetSocName(); snprintf(description, description_size, "%s", soc_name); } -void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, - size_t* total) { +void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) { ggml_cann_set_device(device); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); }
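For completeness, the public entry points whose definitions close out this diff (ggml_backend_cann_init, ggml_backend_cann_get_device_count, ggml_backend_cann_get_device_description, ggml_backend_cann_get_device_memory) can be exercised as below. This is a minimal usage sketch, assuming the ggml-cann.h public header and the core ggml_backend_free / ggml_backend_graph_compute API.

// --- sketch, not part of the patch: minimal use of the public CANN backend API ---
#include "ggml-cann.h"
#include <cstdio>

int main() {
    const int32_t n = ggml_backend_cann_get_device_count();
    for (int32_t i = 0; i < n; i++) {
        char   desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_cann_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cann_get_device_memory(i, &free_mem, &total_mem);
        std::printf("CANN device %d: %s, %zu/%zu bytes free\n", i, desc, free_mem, total_mem);
    }

    ggml_backend_t backend = ggml_backend_cann_init(0);  // device 0; returns nullptr on failure
    if (backend == nullptr) {
        return 1;  // the init path above already logs the error
    }
    // ... build a ggml graph and run it via ggml_backend_graph_compute(backend, &cgraph) ...
    ggml_backend_free(backend);
    return 0;
}
// --- end sketch ---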