From 2d72bd94b065ae5f72991edbac1cf9dc26591353 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Thu, 30 Jan 2025 19:46:34 +0530 Subject: [PATCH 01/40] SYCL: remove ggml_sycl_op_flatten function --- ggml/src/ggml-sycl/common.cpp | 34 -- ggml/src/ggml-sycl/common.hpp | 15 +- ggml/src/ggml-sycl/element_wise.cpp | 490 ++++++++++++---------------- ggml/src/ggml-sycl/ggml-sycl.cpp | 418 ++++++++++-------------- ggml/src/ggml-sycl/im2col.cpp | 38 +-- ggml/src/ggml-sycl/im2col.hpp | 5 +- ggml/src/ggml-sycl/norm.cpp | 58 ++-- ggml/src/ggml-sycl/norm.hpp | 17 +- ggml/src/ggml-sycl/rope.cpp | 63 ++-- ggml/src/ggml-sycl/rope.hpp | 4 +- ggml/src/ggml-sycl/tsembd.cpp | 6 +- ggml/src/ggml-sycl/wkv6.cpp | 5 - 12 files changed, 465 insertions(+), 688 deletions(-) diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 022e7b7637bd3..9260a58c26278 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -65,37 +65,3 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block } return sycl_down_blk_size; } - -void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const ggml_sycl_op_flatten_t op) try { - - const bool use_src1 = src1 != nullptr; - if(use_src1) - GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); - - // dd = data device - float * src0_ddf = (float *) src0->data; - float * src1_ddf = use_src1 ? 
(float *) src1->data : nullptr; - float * dst_ddf = (float *) dst->data; - - ggml_sycl_pool_alloc src0_f(ctx.pool()); - ggml_sycl_pool_alloc src1_f(ctx.pool()); - ggml_sycl_pool_alloc dst_f(ctx.pool()); - - ggml_sycl_set_device(ctx.device); - queue_ptr main_stream = ctx.stream(); - // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n", - // ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device); - - // do the computation - op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); - // print_ggml_tensor("tensor", dst); -} -catch (sycl::exception const &exc) { - - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index abad847ca8199..4bf875c9a08e7 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -677,8 +677,17 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t bool gpu_has_xmx(sycl::device &dev); -void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const ggml_sycl_op_flatten_t op); +// Some backend specific macros +#define GGML_SYCL_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, dst->src[0], ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, dst->src[0], nb) GGML_TENSOR_LOCALS(int64_t, ne1, dst->src[1], ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, dst->src[1], nb) GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 
4bcd74376eaac..6d68ea0779a49 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -1,5 +1,6 @@ #include "common.hpp" #include "element_wise.hpp" +#include "ggml.h" void acc_f32(const float * x, const float * y, float * dst, const int ne, const int ne10, const int ne11, const int ne12, @@ -509,497 +510,410 @@ void pad_f32_sycl(const float *x, float *dst, const int ne00, }); } -inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = 
static_cast(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void 
ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const 
queue_ptr &main_stream) { +inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); - hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); +inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( 
dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + log_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = 
ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); - cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void 
ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + step_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +inline void 
ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); float negative_slope; memcpy(&negative_slope, dst->op_params, sizeof(float)); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); + dpct::queue_ptr main_stream = ctx.stream(); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream); } -inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } -inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); 
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - const float sf0 = (float)dst->ne[0]/src0->ne[0]; - const float sf1 = (float)dst->ne[1]/src0->ne[1]; - const float sf2 = (float)dst->ne[2]/src0->ne[2]; - const float sf3 = (float)dst->ne[3]/src0->ne[3]; + const float sf0 = (float)dst->ne[0]/dst->src[0]->ne[0]; + const float sf1 = (float)dst->ne[1]/dst->src[0]->ne[1]; + const float sf2 = (float)dst->ne[2]/dst->src[0]->ne[2]; + const float sf3 = (float)dst->ne[3]/dst->src[0]->ne[3]; + + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); - upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + upscale_f32_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } -inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); pad_f32_sycl(src0_dd, dst_dd, - src0->ne[0], src0->ne[1], src0->ne[2], + dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], main_stream); - - 
GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } -inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, + ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + const float * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused int offset = dst->op_params[3] / 4; // offset in bytes - acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); + acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream); } -inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, + ggml_tensor *dst) { + // TODO: remove duplicate variables + const float * src0_dd = static_cast(dst->src[0]->data); + const float * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = 
static_cast(dst->data); + const dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); } -inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + // TODO: remove duplicate variables + const float * src0_dd = static_cast(dst->src[0]->data); + const float * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + const dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); } -inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + // TODO: remove duplicate variables + const float * src0_dd = static_cast(dst->src[0]->data); + const float * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + const dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); } -inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { 
+inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + // TODO: remove duplicate variables + const float * src0_dd = static_cast(dst->src[0]->data); + const float * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + const dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); } void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqrt); + ggml_sycl_op_sqrt(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sin); + ggml_sycl_op_sin(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_cos); + ggml_sycl_op_cos(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_acc); + ggml_sycl_op_acc(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu); + ggml_sycl_op_gelu(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - 
ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_silu); + ggml_sycl_op_silu(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu_quick); + ggml_sycl_op_gelu_quick(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_tanh); + ggml_sycl_op_tanh(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_relu); + ggml_sycl_op_relu(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sigmoid); + ggml_sycl_op_sigmoid(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardsigmoid); + ggml_sycl_op_hardsigmoid(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardswish); + ggml_sycl_op_hardswish(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], 
dst->src[1], dst, ggml_sycl_op_exp); + ggml_sycl_op_exp(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_log); + ggml_sycl_op_log(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_neg); + ggml_sycl_op_neg(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_step); + ggml_sycl_op_step(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_leaky_relu); + ggml_sycl_op_leaky_relu(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqr); + ggml_sycl_op_sqr(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_upscale); + ggml_sycl_op_upscale(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pad); + ggml_sycl_op_pad(ctx, dst); 
GGML_SYCL_DEBUG("call %s done\n", __func__); } @@ -1007,24 +921,24 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add); + ggml_sycl_op_add(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sub); + ggml_sycl_op_sub(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_mul); + ggml_sycl_op_mul(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_div); + ggml_sycl_op_div(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 2984ed82e8a7c..91c244579ff07 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1897,12 +1897,9 @@ static void pool2d_nchw_kernel( } template -static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const void *src0_dd, - const int32_t *src1_dd, float *dst_dd, - queue_ptr stream) { +static void get_rows_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_TENSOR_BINARY_OP_LOCALS + GGML_SYCL_TENSOR_BINARY_OP_LOCALS const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / 
(2*SYCL_GET_ROWS_BLOCK_SIZE); @@ -1914,12 +1911,17 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr const size_t s2 = nb2 / ggml_element_size(dst); const size_t s3 = nb3 / ggml_element_size(dst); - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / ggml_element_size(dst->src[1]); + const size_t s11 = nb11 / ggml_element_size(dst->src[1]); + const size_t s12 = nb12 / ggml_element_size(dst->src[1]); + //const size_t s13 = nb13 / ggml_element_size(dst->src[1]); GGML_ASSERT(ne00 % 2 == 0); + const void * src0_dd = dst->src[0]->data; + const int32_t * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + + dpct::queue_ptr stream = ctx.stream(); stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { @@ -1928,17 +1930,12 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *sr s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); }); - GGML_UNUSED(dst); - GGML_UNUSED(ctx); } template -static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const src0_t *src0_dd, const int32_t *src1_dd, - float *dst_dd, queue_ptr stream) { +static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_TENSOR_BINARY_OP_LOCALS + GGML_SYCL_TENSOR_BINARY_OP_LOCALS const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE; @@ -1950,10 +1947,15 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens const size_t s2 = nb2 / ggml_element_size(dst); const size_t s3 = nb3 / ggml_element_size(dst); - const size_t s10 = nb10 / 
ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); + const size_t s10 = nb10 / ggml_element_size(dst->src[1]); + const size_t s11 = nb11 / ggml_element_size(dst->src[1]); + const size_t s12 = nb12 / ggml_element_size(dst->src[1]); + //const size_t s13 = nb13 / ggml_element_size(dst->src[1]); + const src0_t * src0_dd = static_cast(dst->src[0]->data); + const int32_t * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + + dpct::queue_ptr stream = ctx.stream(); { dpct::has_capability_or_fail(stream->get_device(), @@ -1966,9 +1968,6 @@ static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tens s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); }); } - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); } static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, @@ -2494,62 +2493,53 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_d, const float *src1_d, - float *dst_d, const queue_ptr &stream) { +static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); + GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); + GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - const int32_t * src1_i32 = (const int32_t *) src1_d; - - switch (src0->type) { + switch (dst->src[0]->type) { case GGML_TYPE_F16: - get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half 
*)src0_d, - src1_i32, dst_d, stream); + get_rows_sycl_float(ctx, dst); break; case GGML_TYPE_F32: - get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_sycl_float(ctx, dst); break; case GGML_TYPE_Q4_0: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_sycl(ctx, dst); break; case GGML_TYPE_Q4_1: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_sycl(ctx, dst); break; case GGML_TYPE_Q5_0: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_sycl(ctx, dst); break; case GGML_TYPE_Q5_1: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_sycl(ctx, dst); break; case GGML_TYPE_Q8_0: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); + get_rows_sycl(ctx, dst); break; default: // TODO: k-quants - GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); + GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(dst->src[0]->type)); GGML_ABORT("fatal error"); break; } } -static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_d, const float *src1_d, - float *dst_d, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream); +static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + // TODO: remove duplicate variables + const float * src0_d = static_cast(dst->src[0]->data); + float * dst_d = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); - GGML_UNUSED(src1); - GGML_UNUSED(src1_d); + ggml_sycl_op_bin_bcast>(ctx, dst, dst->src[0], dst, nullptr, src0_d, dst_d, main_stream); } @@ -2685,13 +2675,10 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const 
ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); const int32_t * opts = (const int32_t *)dst->op_params; enum ggml_op_pool op = static_cast(opts[0]); @@ -2702,8 +2689,8 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens const int p0 = opts[5]; const int p1 = opts[6]; - const int64_t IH = src0->ne[1]; - const int64_t IW = src0->ne[0]; + const int64_t IH = dst->src[0]->ne[1]; + const int64_t IW = dst->src[0]->ne[0]; const int64_t N = dst->ne[3]; const int64_t OC = dst->ne[2]; @@ -2712,7 +2699,10 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens const int parallel_elements = N * OC * OH * OW; const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; - sycl::range<3> block_nums(1, 1, num_blocks); + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + sycl::range<3> block_nums(1, 1, num_blocks); main_stream->parallel_for( sycl::nd_range<3>(block_nums * sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), @@ -2722,163 +2712,122 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tens parallel_elements, src0_dd, dst_dd, op, item_ct1); }); - - GGML_UNUSED(src1); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } -inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == 
GGML_TYPE_F32); +inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ne = ggml_nelements(src0); + const int64_t ne = ggml_nelements(dst->src[0]); + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } -inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); + const int64_t ncols = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } -inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); + GGML_ASSERT(dst->src[0]->type 
== GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_I32); - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); + const int64_t ncols = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + int32_t * dst_dd = static_cast(dst->data); - argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + argsort_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, order, main_stream); } -inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_I32); - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); + const int64_t ncols = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); - argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream); + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + int32_t * dst_dd = static_cast(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); } -inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float 
*dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int nrows0 = ggml_nrows(src0); + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t ne01 = dst->src[0]->ne[1]; + const int nrows0 = ggml_nrows(dst->src[0]); const int n_past = ((int32_t *) dst->op_params)[0]; + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } -inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); float scale; memcpy(&scale, dst->op_params, sizeof(float)); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); + dpct::queue_ptr main_stream = ctx.stream(); + + scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream); /* DPCT1010:87: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. 
*/ SYCL_CHECK(0); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } -inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); float min; float max; memcpy(&min, dst->op_params, sizeof(float)); memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); - clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); + clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), main_stream); /* DPCT1010:88: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code. 
*/ SYCL_CHECK(0); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { @@ -3247,33 +3196,21 @@ catch (sycl::exception const &exc) { } -static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_repeat); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_get_rows); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_norm); + ggml_sycl_op_norm(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rms_norm); + ggml_sycl_op_rms_norm(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm); + ggml_sycl_op_group_norm(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } @@ -3646,7 +3583,7 @@ __dpct_inline__ static void k_copy_dst_from_contiguous( } static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, - ggml_tensor *dst) try { + ggml_tensor * dst) try { const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not 
support split buffers"); @@ -3815,22 +3752,21 @@ catch (sycl::exception const &exc) { } static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_scale); + ggml_sycl_op_scale(ctx, dst); } static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_clamp); + ggml_sycl_op_clamp(ctx, dst); } -static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) try { +static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - GGML_TENSOR_BINARY_OP_LOCALS01; + GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS; SYCL_CHECK(ggml_sycl_set_device(ctx.device)); queue_ptr main_stream = ctx.stream(); @@ -3861,7 +3797,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr ggml_type_name(src0->type), ggml_type_name(src1->type)); GGML_ABORT("fatal error"); } - GGML_UNUSED(dst); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -3871,44 +3806,39 @@ catch (sycl::exception const &exc) { static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // TODO: why do we pass dst as src1 here? 
- ggml_sycl_cpy(ctx, dst->src[0], dst, nullptr); + ggml_sycl_cpy(ctx, dst->src[0], dst); } static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf); + ggml_sycl_op_diag_mask_inf(ctx, dst); } static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope); + ggml_sycl_op_rope(ctx, dst); } static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pool2d); + ggml_sycl_op_pool2d(ctx, dst); } static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_im2col); + ggml_sycl_op_im2col(ctx, dst); } static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum); + ggml_sycl_op_sum(ctx, dst); } static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum_rows); + ggml_sycl_op_sum_rows(ctx, dst); } static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argsort); -} - -static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argmax); + ggml_sycl_op_argsort(ctx, dst); } @@ -3942,138 
+3872,138 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens switch (dst->op) { case GGML_OP_ARGMAX: - ggml_sycl_argmax(ctx, dst); + ggml_sycl_op_argmax(ctx, dst); // done break; case GGML_OP_CONV_TRANSPOSE_1D: - ggml_sycl_op_conv_transpose_1d(ctx, dst); + ggml_sycl_op_conv_transpose_1d(ctx, dst); // already good break; case GGML_OP_REPEAT: - ggml_sycl_repeat(ctx, dst); + ggml_sycl_op_repeat(ctx, dst); // partially done break; case GGML_OP_GET_ROWS: - ggml_sycl_get_rows(ctx, dst); + ggml_sycl_op_get_rows(ctx, dst); // done break; case GGML_OP_DUP: - ggml_sycl_dup(ctx, dst); + ggml_sycl_dup(ctx, dst); // done break; case GGML_OP_ADD: case GGML_OP_ADD1: // TODO: more efficient implementation - ggml_sycl_add(ctx, dst); + ggml_sycl_add(ctx, dst); // partially done break; case GGML_OP_SUB: - ggml_sycl_sub(ctx, dst); + ggml_sycl_sub(ctx, dst); // partially done break; case GGML_OP_ACC: - ggml_sycl_acc(ctx, dst); + ggml_sycl_acc(ctx, dst); // fully done break; case GGML_OP_MUL: - ggml_sycl_mul(ctx, dst); + ggml_sycl_mul(ctx, dst); // partially done break; case GGML_OP_LOG: - ggml_sycl_log(ctx, dst); + ggml_sycl_log(ctx, dst); // fully done break; case GGML_OP_DIV: - ggml_sycl_div(ctx, dst); + ggml_sycl_div(ctx, dst); // partially done break; case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_NEG: - ggml_sycl_neg(ctx, dst); + ggml_sycl_neg(ctx, dst); // done break; case GGML_UNARY_OP_STEP: - ggml_sycl_step(ctx, dst); + ggml_sycl_step(ctx, dst); // done break; case GGML_UNARY_OP_GELU: - ggml_sycl_gelu(ctx, dst); + ggml_sycl_gelu(ctx, dst); // done break; case GGML_UNARY_OP_SILU: - ggml_sycl_silu(ctx, dst); + ggml_sycl_silu(ctx, dst); // done break; case GGML_UNARY_OP_GELU_QUICK: - ggml_sycl_gelu_quick(ctx, dst); + ggml_sycl_gelu_quick(ctx, dst); // done break; case GGML_UNARY_OP_TANH: - ggml_sycl_tanh(ctx, dst); + ggml_sycl_tanh(ctx, dst); // done break; case GGML_UNARY_OP_RELU: - ggml_sycl_relu(ctx, dst); + 
ggml_sycl_relu(ctx, dst); // done break; case GGML_UNARY_OP_SIGMOID: - ggml_sycl_sigmoid(ctx, dst); + ggml_sycl_sigmoid(ctx, dst); // done break; case GGML_UNARY_OP_HARDSIGMOID: - ggml_sycl_hardsigmoid(ctx, dst); + ggml_sycl_hardsigmoid(ctx, dst); // done break; case GGML_UNARY_OP_HARDSWISH: - ggml_sycl_hardswish(ctx, dst); + ggml_sycl_hardswish(ctx, dst); // done break; case GGML_UNARY_OP_EXP: - ggml_sycl_exp(ctx, dst); + ggml_sycl_exp(ctx, dst); // done break; default: return false; } break; case GGML_OP_NORM: - ggml_sycl_norm(ctx, dst); + ggml_sycl_norm(ctx, dst); // done break; case GGML_OP_GROUP_NORM: - ggml_sycl_group_norm(ctx, dst); + ggml_sycl_group_norm(ctx, dst); // done break; case GGML_OP_CONCAT: - ggml_sycl_op_concat(ctx, dst); + ggml_sycl_op_concat(ctx, dst); // already good break; case GGML_OP_UPSCALE: - ggml_sycl_upscale(ctx, dst); + ggml_sycl_upscale(ctx, dst); // done break; case GGML_OP_PAD: - ggml_sycl_pad(ctx, dst); + ggml_sycl_pad(ctx, dst); // done break; case GGML_OP_LEAKY_RELU: - ggml_sycl_leaky_relu(ctx, dst); + ggml_sycl_leaky_relu(ctx, dst); // done break; case GGML_OP_RMS_NORM: - ggml_sycl_rms_norm(ctx, dst); + ggml_sycl_rms_norm(ctx, dst); // done break; case GGML_OP_MUL_MAT: if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { return false; } /* ggml_sycl_mul_mat_id is dependent on ggml_sycl_mul_mat */ - ggml_sycl_mul_mat(ctx, dst->src[0], dst->src[1], dst); + ggml_sycl_mul_mat(ctx, dst->src[0], dst->src[1], dst); // good break; case GGML_OP_MUL_MAT_ID: if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { return false; } - ggml_sycl_mul_mat_id(ctx, dst); + ggml_sycl_mul_mat_id(ctx, dst); // good break; case GGML_OP_OUT_PROD: - ggml_sycl_op_out_prod(ctx, dst); + ggml_sycl_op_out_prod(ctx, dst); // good break; case GGML_OP_SCALE: - ggml_sycl_scale(ctx, dst); + ggml_sycl_scale(ctx, dst); // done break; case GGML_OP_SQR: - ggml_sycl_sqr(ctx, dst); + ggml_sycl_sqr(ctx, dst); // done break; case GGML_OP_SQRT: - ggml_sycl_sqrt(ctx, dst); + 
ggml_sycl_sqrt(ctx, dst); // done break; case GGML_OP_SIN: - ggml_sycl_sin(ctx, dst); + ggml_sycl_sin(ctx, dst); //done break; case GGML_OP_COS: - ggml_sycl_cos(ctx, dst); + ggml_sycl_cos(ctx, dst); // done break; case GGML_OP_CLAMP: - ggml_sycl_clamp(ctx, dst); + ggml_sycl_clamp(ctx, dst); // done break; case GGML_OP_CPY: - ggml_sycl_cpy(ctx, dst->src[0], dst->src[1], dst); + ggml_sycl_cpy(ctx, dst->src[0], dst->src[1]); // okayish, need check break; case GGML_OP_CONT: - ggml_sycl_dup(ctx, dst); + ggml_sycl_dup(ctx, dst); // done break; case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -4083,34 +4013,34 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__); break; case GGML_OP_DIAG_MASK_INF: - ggml_sycl_diag_mask_inf(ctx, dst); + ggml_sycl_diag_mask_inf(ctx, dst); // done break; case GGML_OP_SOFT_MAX: - ggml_sycl_op_soft_max(ctx, dst); + ggml_sycl_op_soft_max(ctx, dst); // already good break; case GGML_OP_ROPE: - ggml_sycl_rope(ctx, dst); + ggml_sycl_rope(ctx, dst); // done break; case GGML_OP_IM2COL: - ggml_sycl_im2col(ctx, dst); + ggml_sycl_im2col(ctx, dst); // done break; case GGML_OP_POOL_2D: - ggml_sycl_pool2d(ctx, dst); + ggml_sycl_pool2d(ctx, dst); // done break; case GGML_OP_SUM: - ggml_sycl_sum(ctx, dst); + ggml_sycl_sum(ctx, dst); // done break; case GGML_OP_SUM_ROWS: - ggml_sycl_sum_rows(ctx, dst); + ggml_sycl_sum_rows(ctx, dst); // done break; case GGML_OP_ARGSORT: - ggml_sycl_argsort(ctx, dst); + ggml_sycl_argsort(ctx, dst); // done break; case GGML_OP_TIMESTEP_EMBEDDING: - ggml_sycl_op_timestep_embedding(ctx, dst); + ggml_sycl_op_timestep_embedding(ctx, dst); // already pretty good break; case GGML_OP_RWKV_WKV6: - ggml_sycl_op_rwkv_wkv6(ctx, dst); + ggml_sycl_op_rwkv_wkv6(ctx, dst); // good break; case GGML_OP_GATED_LINEAR_ATTN: ggml_sycl_op_gated_linear_attn(ctx, dst); diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index 6146a99edbe77..4da9d12d8e5a4 
100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -82,13 +82,10 @@ static void im2col_sycl( } } -void ggml_sycl_op_im2col( - ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; @@ -100,27 +97,28 @@ void ggml_sycl_op_im2col( const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; - const int64_t IC = src1->ne[is_2D ? 2 : 1]; - const int64_t IH = is_2D ? src1->ne[1] : 1; - const int64_t IW = src1->ne[0]; + const int64_t IC = dst->src[1]->ne[is_2D ? 2 : 1]; + const int64_t IH = is_2D ? dst->src[1]->ne[1] : 1; + const int64_t IW = dst->src[1]->ne[0]; - const int64_t KH = is_2D ? src0->ne[1] : 1; - const int64_t KW = src0->ne[0]; + const int64_t KH = is_2D ? dst->src[0]->ne[1] : 1; + const int64_t KW = dst->src[0]->ne[0]; const int64_t OH = is_2D ? dst->ne[2] : 1; const int64_t OW = dst->ne[1]; - const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 - const int64_t batch = src1->ne[3]; - const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 + const size_t delta_offset = dst->src[1]->nb[is_2D ? 
2 : 1] / 4; // nb is byte offset, src is type float32 + const int64_t batch = dst->src[1]->ne[3]; + const size_t batch_offset = dst->src[1]->nb[3] / 4; // nb is byte offset, src is type float32 + dpct::queue_ptr main_stream = ctx.stream(); if (dst->type == GGML_TYPE_F16) { - im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + const float * src1_dd = static_cast(dst->src[1]->data); + sycl::half * dst_dd = static_cast(dst->data); + im2col_sycl(src1_dd, dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); } else { - im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + const float * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + im2col_sycl(src1_dd, dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); } - - GGML_UNUSED(src0); - GGML_UNUSED(src0_dd); - GGML_UNUSED(ctx); } diff --git a/ggml/src/ggml-sycl/im2col.hpp b/ggml/src/ggml-sycl/im2col.hpp index 7db144fbbe524..4474c7b7b9157 100644 --- a/ggml/src/ggml-sycl/im2col.hpp +++ b/ggml/src/ggml-sycl/im2col.hpp @@ -15,9 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_im2col( - ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream); +void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_IM2COL_HPP diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 9cf2be15575d8..628bdfa4dbc47 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -311,34 +311,27 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, } } -void 
ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst, const float* src0_dd, - const float* src1_dd, float* dst_dd, - const queue_ptr& main_stream) { +void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); float eps; memcpy(&eps, dst->op_params, sizeof(float)); - norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); - (void)src1; - (void)dst; - (void)src1_dd; + norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); } -void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst, - const float* src0_dd, const float* src1_dd, - float* dst_dd, - const queue_ptr& main_stream) { +void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); int num_groups = dst->op_params[0]; @@ -346,33 +339,26 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* float eps; memcpy(&eps, dst->op_params + 1, sizeof(float)); - int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device); - - (void)src1; - (void)dst; - (void)src1_dd; - GGML_UNUSED(ctx); + int group_size = dst->src[0]->ne[0] * 
dst->src[0]->ne[1] * ((dst->src[0]->ne[2] + num_groups - 1) / num_groups); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, dst->src[0]->ne[0] * dst->src[0]->ne[1] * dst->src[0]->ne[2], main_stream, ctx.device); } -void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst, - const float* src0_dd, const float* src1_dd, - float* dst_dd, - const queue_ptr& main_stream) { +void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - const int64_t ne00 = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); float eps; memcpy(&eps, dst->op_params, sizeof(float)); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); - - (void)src1; - (void)dst; - (void)src1_dd; } diff --git a/ggml/src/ggml-sycl/norm.hpp b/ggml/src/ggml-sycl/norm.hpp index a9ad9156fa33e..e733de5c23c81 100644 --- a/ggml/src/ggml-sycl/norm.hpp +++ b/ggml/src/ggml-sycl/norm.hpp @@ -15,21 +15,10 @@ #include "common.hpp" -void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst, const float* src0_dd, - const float* src1_dd, float* dst_dd, - const queue_ptr& main_stream); +void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor * dst); -void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst, - const float* src0_dd, 
const float* src1_dd, - float* dst_dd, - const queue_ptr& main_stream); +void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, ggml_tensor * dst); -void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst, - const float* src0_dd, const float* src1_dd, - float* dst_dd, - const queue_ptr& main_stream); +void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor * dst); #endif // GGML_SYCL_NORM_HPP diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 1244b231af738..2a6c3ca7554da 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -192,18 +192,15 @@ static void rope_neox_sycl( } } -void ggml_sycl_op_rope( - ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) { - const ggml_tensor * src2 = dst->src[2]; +void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - GGML_ASSERT(src0->type == dst->type); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->src[0]->type == dst->type); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t nr = ggml_nrows(src0); + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t ne01 = dst->src[0]->ne[1]; + const int64_t nr = ggml_nrows(dst->src[0]); //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; @@ -228,49 +225,49 @@ void ggml_sycl_op_rope( const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const int32_t * pos = (const int32_t *) src1_dd; - + const int32_t * pos = 
static_cast(dst->src[1]->data); const float * freq_factors = nullptr; - if (src2 != nullptr) { - freq_factors = (const float *) src2->data; + if (dst->src[2] != nullptr) { + freq_factors = static_cast(dst->src[2]->data); } rope_corr_dims corr_dims; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); + dpct::queue_ptr main_stream = ctx.stream(); // compute if (is_neox) { - if (src0->type == GGML_TYPE_F32) { - rope_neox_sycl( - (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, main_stream - ); - } else if (src0->type == GGML_TYPE_F16) { - rope_neox_sycl( - (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, main_stream - ); + if (dst->src[0]->type == GGML_TYPE_F32) { + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + rope_neox_sycl(src0_dd, dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream); + } else if (dst->src[0]->type == GGML_TYPE_F16) { + const sycl::half * src0_dd = static_cast(dst->src[0]->data); + sycl::half * dst_dd = static_cast(dst->data); + rope_neox_sycl(src0_dd, dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, + freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream); } else { GGML_ABORT("fatal error"); } } else { - if (src0->type == GGML_TYPE_F32) { + if (dst->src[0]->type == GGML_TYPE_F32) { + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); rope_norm_sycl( - (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + src0_dd, dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream ); - } else if (src0->type == 
GGML_TYPE_F16) { + } else if (dst->src[0]->type == GGML_TYPE_F16) { + const sycl::half * src0_dd = static_cast(dst->src[0]->data); + sycl::half * dst_dd = static_cast(dst->data); rope_norm_sycl( - (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + src0_dd, dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream ); } else { GGML_ABORT("fatal error"); } } - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); } diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp index 00354c3131bd7..dd15ac6d8967f 100644 --- a/ggml/src/ggml-sycl/rope.hpp +++ b/ggml/src/ggml-sycl/rope.hpp @@ -15,8 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_rope( - ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); +void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_ROPE_HPP diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index b877d18c1730a..9de324c3a14c4 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -57,9 +57,8 @@ static void timestep_embedding_f32_sycl( void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; - const float * src0_d = (const float *)src0->data; - float * dst_d = (float *)dst->data; + const float * src0_d = static_cast(src0->data); + float * dst_d = static_cast(dst->data); dpct::queue_ptr stream = ctx.stream(); GGML_ASSERT(src0->type == GGML_TYPE_F32); @@ -69,5 +68,4 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso const int max_period = dst->op_params[1]; timestep_embedding_f32_sycl(src0_d, dst_d, 
src0->ne[0], dst->nb[1], dim, max_period, stream); - GGML_UNUSED(src1); } diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp index b54c20964ed5d..e3ea568c5f5e7 100644 --- a/ggml/src/ggml-sycl/wkv6.cpp +++ b/ggml/src/ggml-sycl/wkv6.cpp @@ -97,9 +97,6 @@ static void rwkv_wkv_f32_kernel( void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { - const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; - const float* k_d = (const float*)dst->src[0]->data; const float* v_d = (const float*)dst->src[1]->data; const float* r_d = (const float*)dst->src[2]->data; @@ -138,6 +135,4 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { }); }); - GGML_UNUSED(src0); - GGML_UNUSED(src1); } From 957c11b2cf0926997e028eebe6020d64a301283b Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Fri, 31 Jan 2025 18:30:29 +0530 Subject: [PATCH 02/40] binbcast: use void pointer to prevent intermediate type conversions --- ggml/src/ggml-sycl/common.hpp | 20 ++++++++-------- ggml/src/ggml-sycl/element_wise.cpp | 36 +++++++++++++---------------- ggml/src/ggml-sycl/ggml-sycl.cpp | 7 +++--- 3 files changed, 28 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 4bf875c9a08e7..ae27787845d03 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -508,8 +508,7 @@ static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t template struct bin_bcast_sycl { template - void operator()(ggml_backend_sycl_context & ctx, - const struct ggml_tensor *src0, + void operator()(const struct ggml_tensor *src0, const struct ggml_tensor *src1, struct ggml_tensor *dst, const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd, queue_ptr stream) { @@ -643,30 +642,29 @@ struct bin_bcast_sycl { }); } } - GGML_UNUSED(ctx); } }; template -inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const 
ggml_tensor *src0, +inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, + const void *src0_dd, const void *src1_dd, + void *dst_dd, const queue_ptr &main_stream) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, + op()(src0, src1, dst, (const sycl::half *)src0_dd, (const float *)src1_dd, (sycl::half *)dst_dd, main_stream); } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd, + op()(src0, src1, dst, (const sycl::half *)src0_dd, (const float *)src1_dd, (float *)dst_dd, main_stream); } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { - op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd, + op()(src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd, main_stream); } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { - op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd, + op()(src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd, main_stream); } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 6d68ea0779a49..185bf11e795ab 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -756,43 +756,39 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, inline void 
ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - // TODO: remove duplicate variables - const float * src0_dd = static_cast(dst->src[0]->data); - const float * src1_dd = static_cast(dst->src[1]->data); - float * dst_dd = static_cast(dst->data); + const void * src0_dd = static_cast(dst->src[0]->data); + const void * src1_dd = static_cast(dst->src[1]->data); + void * dst_dd = static_cast(dst->data); const dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); } inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - // TODO: remove duplicate variables - const float * src0_dd = static_cast(dst->src[0]->data); - const float * src1_dd = static_cast(dst->src[1]->data); - float * dst_dd = static_cast(dst->data); + const void * src0_dd = static_cast(dst->src[0]->data); + const void * src1_dd = static_cast(dst->src[1]->data); + void * dst_dd = static_cast(dst->data); const dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); } inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - // TODO: remove duplicate variables - const float * src0_dd = static_cast(dst->src[0]->data); - const float * src1_dd = static_cast(dst->src[1]->data); - float * dst_dd = static_cast(dst->data); + const void * src0_dd = static_cast(dst->src[0]->data); + const void * src1_dd = static_cast(dst->src[1]->data); + void * dst_dd = static_cast(dst->data); const dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], 
dst, src0_dd, src1_dd, dst_dd, main_stream); } inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - // TODO: remove duplicate variables - const float * src0_dd = static_cast(dst->src[0]->data); - const float * src1_dd = static_cast(dst->src[1]->data); - float * dst_dd = static_cast(dst->data); + const void * src0_dd = static_cast(dst->src[0]->data); + const void * src1_dd = static_cast(dst->src[1]->data); + void * dst_dd = static_cast(dst->data); const dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 91c244579ff07..0c49cb54fd21a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2534,12 +2534,11 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - // TODO: remove duplicate variables - const float * src0_d = static_cast(dst->src[0]->data); - float * dst_d = static_cast(dst->data); + const void * src0_d = static_cast(dst->src[0]->data); + void * dst_d = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); - ggml_sycl_op_bin_bcast>(ctx, dst, dst->src[0], dst, nullptr, src0_d, dst_d, main_stream); + ggml_sycl_op_bin_bcast>(dst, dst->src[0], dst, nullptr, src0_d, dst_d, main_stream); } From 108be39dfe4de1436c2044a1255e38a394ac4b37 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Fri, 31 Jan 2025 20:10:44 +0530 Subject: [PATCH 03/40] binbcast: move to a separate file --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/binbcast.cpp | 311 ++++++++++++++++++++++++++++ ggml/src/ggml-sycl/binbcast.hpp | 16 ++ ggml/src/ggml-sycl/common.hpp | 246 ---------------------- 
ggml/src/ggml-sycl/element_wise.cpp | 65 ------ ggml/src/ggml-sycl/element_wise.hpp | 30 --- ggml/src/ggml-sycl/ggml-sycl.cpp | 12 +- 7 files changed, 329 insertions(+), 352 deletions(-) create mode 100644 ggml/src/ggml-sycl/binbcast.cpp create mode 100644 ggml/src/ggml-sycl/binbcast.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index b1df4e5db1753..cdb89e392cb64 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -29,6 +29,7 @@ #include "wkv6.hpp" #include "outprod.hpp" #include "element_wise.hpp" +#include "binbcast.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp new file mode 100644 index 0000000000000..b2b113432598d --- /dev/null +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -0,0 +1,311 @@ +#include "binbcast.hpp" +#include "common.hpp" + +static __dpct_inline__ float op_repeat(const float a, const float b) { + return b; + GGML_UNUSED(a); +} + +static __dpct_inline__ float op_add(const float a, const float b) { + return a + b; +} + +static __dpct_inline__ float op_sub(const float a, const float b) { + return a - b; +} + +static __dpct_inline__ float op_mul(const float a, const float b) { + return a * b; +} + +static __dpct_inline__ float op_div(const float a, const float b) { + return a / b; +} + +template +static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13, const sycl::nd_item<3> & item_ct1) { + const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1)); + const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + item_ct1.get_local_id(0)) / ne3; + const int i3 = 
(item_ct1.get_local_range(0) * item_ct1.get_group(0) + item_ct1.get_local_id(0)) % ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3 * s3 + i2 * s2 + i1 * s1; + const size_t i_src1 = i13 * s13 + i12 * s12 + i11 * s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t) bin_op(src0 ? (float) src0_row[i0] : 0.0f, (float) src1_row[i10]); + } +} + +template +static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, int ne0, int ne1, int ne2, + int ne3, int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s10,*/ int s11, int s12, int s13, const sycl::nd_item<3> & item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + + const int i3 = i / (ne2 * ne1 * ne0); + const int i2 = (i / (ne1 * ne0)) % ne2; + const int i1 = (i / ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3 * s3 + i2 * s2 + i1 * s1; + const size_t i_src1 = i13 * s13 + i12 * s12 + i11 * s11; + const size_t i_dst = i_src0; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t) bin_op(src0 ? 
(float) src0_row[i0] : 0.0f, (float) src1_row[i10]); +} + +template struct bin_bcast_sycl { + template + void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, + const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, queue_ptr stream) { + GGML_TENSOR_BINARY_OP_LOCALS + + int nr0 = ne10 / ne0; + int nr1 = ne11 / ne1; + int nr2 = ne12 / ne2; + int nr3 = ne13 / ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne0[] = { ne0, ne1, ne2, ne3 }; + int64_t cne1[] = { ne10, ne11, ne12, ne13 }; + size_t cnb0[] = { nb0, nb1, nb2, nb3 }; + size_t cnb1[] = { nb10, nb11, nb12, nb13 }; + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne0); + collapse(cne1); + } + } + { + int64_t ne0 = cne0[0]; + int64_t ne1 = cne0[1]; + int64_t ne2 = cne0[2]; + int64_t ne3 = cne0[3]; + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb0[0]; + size_t nb1 = cnb0[1]; + size_t nb2 = cnb0[2]; + size_t nb3 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0 / 2LL, 1LL); + + sycl::range<3> block_dims(1, 1, 1); + 
block_dims[2] = std::min(hne0, block_size); + block_dims[1] = std::min(ne1, block_size / (unsigned int) block_dims[2]); + block_dims[0] = std::min(std::min(ne2 * ne3, block_size / (unsigned int) block_dims[2] / + (unsigned int) block_dims[1]), + 64U); + + sycl::range<3> block_nums((ne2 * ne3 + block_dims[0] - 1) / block_dims[0], + (ne1 + block_dims[1] - 1) / block_dims[1], + (hne0 + block_dims[2] - 1) / block_dims[2]); + + if (block_nums[0] > 65535) { + // this is the maximum number of blocks in z direction, fallback to 1D grid kernel + int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast_unravel(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, + ne13, s1, s2, s3, s11, s12, s13, item_ct1); + }); + } + } else { + /* + DPCT1049:16: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. 
+ */ + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, + ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1); + }); + } + } + } +}; + +template +inline void ggml_sycl_op_bin_bcast(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const void * src0_dd, const void * src1_dd, void * dst_dd, + const queue_ptr & main_stream) { + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const float *) src0_dd, (const float *) src1_dd, (float *) dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + op()(src0, src1, dst, (const sycl::half *) src0_dd, (const float *) src1_dd, (sycl::half *) dst_dd, + main_stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const sycl::half *) src0_dd, (const float *) src1_dd, (float *) dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { + op()(src0, src1, dst, (const int32_t *) src0_dd, (const int32_t *) src1_dd, (int32_t *) dst_dd, main_stream); + } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { + op()(src0, src1, dst, (const int16_t *) src0_dd, (const int16_t *) src1_dd, (int16_t *) dst_dd, main_stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), + ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ABORT("fatal error"); + } +} + +inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const void * src0_dd = static_cast(dst->src[0]->data); + const void * src1_dd = static_cast(dst->src[1]->data); + void * dst_dd = static_cast(dst->data); + const dpct::queue_ptr main_stream = ctx.stream(); + + 
ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, + main_stream); +} + +inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const void * src0_dd = static_cast(dst->src[0]->data); + const void * src1_dd = static_cast(dst->src[1]->data); + void * dst_dd = static_cast(dst->data); + const dpct::queue_ptr main_stream = ctx.stream(); + + ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, + main_stream); +} + +inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const void * src0_dd = static_cast(dst->src[0]->data); + const void * src1_dd = static_cast(dst->src[1]->data); + void * dst_dd = static_cast(dst->data); + const dpct::queue_ptr main_stream = ctx.stream(); + + ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, + main_stream); +} + +inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const void * src0_dd = static_cast(dst->src[0]->data); + const void * src1_dd = static_cast(dst->src[1]->data); + void * dst_dd = static_cast(dst->data); + const dpct::queue_ptr main_stream = ctx.stream(); + + ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, + main_stream); +} + +inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + const void * src0_d = static_cast(dst->src[0]->data); + void * dst_d = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + + ggml_sycl_op_bin_bcast>(dst, dst->src[0], dst, nullptr, src0_d, dst_d, main_stream); +} + +void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_add(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_sub(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", 
__func__); +} + +void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_mul(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_div(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_repeat(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/binbcast.hpp b/ggml/src/ggml-sycl/binbcast.hpp new file mode 100644 index 0000000000000..db8c8f55340a9 --- /dev/null +++ b/ggml/src/ggml-sycl/binbcast.hpp @@ -0,0 +1,16 @@ +#ifndef GGML_SYCL_BINBCAST_HPP +#define GGML_SYCL_BINBCAST_HPP + +#include "common.hpp" + +void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_BINBCAST_HPP diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index ae27787845d03..82719190730e0 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -427,252 +427,6 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg const float *src1_dd, float *dst_dd, const queue_ptr &main_stream); -template -static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s10,*/ int s11, int s12, int s13, - const sycl::nd_item<3> &item_ct1) { - const int i0s = 
item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); - const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + - item_ct1.get_local_id(0)) / - ne3; - const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + - item_ct1.get_local_id(0)) % - ne3; - - if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i_src0; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - for (int i0 = i0s; i0 < ne0; - i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); - } -} - -template -static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s10,*/ int s11, int s12, int s13, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - const int i3 = i/(ne2*ne1*ne0); - const int i2 = (i/(ne1*ne0)) % ne2; - const int i1 = (i/ne0) % ne1; - const int i0 = i % ne0; - - if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i_src0; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - 
const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); -} - - -template -struct bin_bcast_sycl { - template - void operator()(const struct ggml_tensor *src0, - const struct ggml_tensor *src1, struct ggml_tensor *dst, - const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd, - queue_ptr stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - int nr0 = ne10/ne0; - int nr1 = ne11/ne1; - int nr2 = ne12/ne2; - int nr3 = ne13/ne3; - - int nr[4] = { nr0, nr1, nr2, nr3 }; - - // collapse dimensions until first broadcast dimension - int64_t cne0[] = {ne0, ne1, ne2, ne3}; - int64_t cne1[] = {ne10, ne11, ne12, ne13}; - size_t cnb0[] = {nb0, nb1, nb2, nb3}; - size_t cnb1[] = {nb10, nb11, nb12, nb13}; - auto collapse = [](int64_t cne[]) { - cne[0] *= cne[1]; - cne[1] = cne[2]; - cne[2] = cne[3]; - cne[3] = 1; - }; - - auto collapse_nb = [](size_t cnb[], int64_t cne[]) { - cnb[1] *= cne[1]; - cnb[2] *= cne[2]; - cnb[3] *= cne[3]; - }; - - for (int i = 0; i < 4; i++) { - if (nr[i] != 1) { - break; - } - if (i > 0) { - collapse_nb(cnb0, cne0); - collapse_nb(cnb1, cne1); - collapse(cne0); - collapse(cne1); - } - } - { - int64_t ne0 = cne0[0]; - int64_t ne1 = cne0[1]; - int64_t ne2 = cne0[2]; - int64_t ne3 = cne0[3]; - - int64_t ne10 = cne1[0]; - int64_t ne11 = cne1[1]; - int64_t ne12 = cne1[2]; - int64_t ne13 = cne1[3]; - - size_t nb0 = cnb0[0]; - size_t nb1 = cnb0[1]; - size_t nb2 = cnb0[2]; - size_t nb3 = cnb0[3]; - - size_t nb10 = cnb1[0]; - size_t nb11 = cnb1[1]; - size_t nb12 = cnb1[2]; - size_t nb13 = cnb1[3]; - - size_t s0 = nb0 / sizeof(dst_t); - size_t s1 = nb1 / sizeof(dst_t); - size_t s2 = nb2 / sizeof(dst_t); - size_t s3 = nb3 / sizeof(dst_t); - - size_t s10 = nb10 / sizeof(src1_t); - size_t s11 = nb11 / sizeof(src1_t); - size_t s12 = nb12 / sizeof(src1_t); - size_t s13 = nb13 / sizeof(src1_t); - - GGML_ASSERT(s0 == 1); - GGML_ASSERT(s10 == 1); - - const int block_size = 128; - - int64_t hne0 = std::max(ne0/2LL, 
1LL); - - sycl::range<3> block_dims(1, 1, 1); - block_dims[2] = std::min(hne0, block_size); - block_dims[1] = std::min( - ne1, block_size / (unsigned int)block_dims[2]); - block_dims[0] = std::min( - std::min( - ne2 * ne3, block_size / (unsigned int)block_dims[2] / - (unsigned int)block_dims[1]), - 64U); - - sycl::range<3> block_nums( - (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], - (ne1 + block_dims[1] - 1) / block_dims[1], - (hne0 + block_dims[2] - 1) / block_dims[2]); - - if (block_nums[0] > 65535) { - // this is the maximum number of blocks in z direction, fallback to 1D grid kernel - int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * - sycl::range<3>(1, 1, block_size), - sycl::range<3>(1, 1, block_size)), - [=](sycl::nd_item<3> item_ct1) { - k_bin_bcast_unravel( - src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, - s13, item_ct1); - }); - } - } else { - /* - DPCT1049:16: The work-group size passed to the SYCL kernel may - exceed the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if - needed. 
- */ - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, - ne2, ne3, ne10, ne11, ne12, ne13, - s1, s2, s3, s11, s12, s13, - item_ct1); - }); - } - } - } -}; - -template -inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const void *src0_dd, const void *src1_dd, - void *dst_dd, - const queue_ptr &main_stream) { - - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, main_stream); - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - op()(src0, src1, dst, (const sycl::half *)src0_dd, (const float *)src1_dd, - (sycl::half *)dst_dd, main_stream); - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - op()(src0, src1, dst, (const sycl::half *)src0_dd, (const float *)src1_dd, (float *)dst_dd, - main_stream); - } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { - op()(src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd, - main_stream); - } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { - op()(src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd, - main_stream); - } else { - fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, - ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ABORT("fatal error"); - } -} - bool gpu_has_xmx(sycl::device &dev); // Some backend specific macros diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 185bf11e795ab..9682708fd503a 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -754,44 +754,6 
@@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream); } -inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, - ggml_tensor *dst) { - const void * src0_dd = static_cast(dst->src[0]->data); - const void * src1_dd = static_cast(dst->src[1]->data); - void * dst_dd = static_cast(dst->data); - const dpct::queue_ptr main_stream = ctx.stream(); - - ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - const void * src0_dd = static_cast(dst->src[0]->data); - const void * src1_dd = static_cast(dst->src[1]->data); - void * dst_dd = static_cast(dst->data); - const dpct::queue_ptr main_stream = ctx.stream(); - - ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - const void * src0_dd = static_cast(dst->src[0]->data); - const void * src1_dd = static_cast(dst->src[1]->data); - void * dst_dd = static_cast(dst->data); - const dpct::queue_ptr main_stream = ctx.stream(); - - ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - const void * src0_dd = static_cast(dst->src[0]->data); - const void * src1_dd = static_cast(dst->src[1]->data); - void * dst_dd = static_cast(dst->data); - const dpct::queue_ptr main_stream = ctx.stream(); - - ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); -} - - void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_sqrt(ctx, dst); @@ -864,7 +826,6 @@ void 
ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s done\n", __func__); } - void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_exp(ctx, dst); @@ -912,29 +873,3 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_pad(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } - - - -void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_add(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_sub(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_mul(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_div(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 46443264505cc..6c3c3eef8455a 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -3,28 +3,6 @@ #include "common.hpp" -static __dpct_inline__ float op_repeat(const float a, const float b) { - return b; - GGML_UNUSED(a); -} - -static __dpct_inline__ float op_add(const float a, const float b) { - return a + b; -} - -static __dpct_inline__ float op_sub(const float a, const float b) { - return a - b; -} - -static __dpct_inline__ float op_mul(const float a, const float b) { - return a * b; -} - -static __dpct_inline__ float op_div(const float a, const float b) { - return a / b; -} - - void ggml_sycl_sqrt(ggml_backend_sycl_context & 
ctx, ggml_tensor * dst); void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst); @@ -65,12 +43,4 @@ void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - #endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 0c49cb54fd21a..ff51786ca6fbe 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2532,16 +2532,6 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor } } - -static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - const void * src0_d = static_cast(dst->src[0]->data); - void * dst_d = static_cast(dst->data); - dpct::queue_ptr main_stream = ctx.stream(); - - ggml_sycl_op_bin_bcast>(dst, dst->src[0], dst, nullptr, src0_d, dst_d, main_stream); -} - - inline void ggml_sycl_op_mul_mat_sycl( ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -3877,7 +3867,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_op_conv_transpose_1d(ctx, dst); // already good break; case GGML_OP_REPEAT: - ggml_sycl_op_repeat(ctx, dst); // partially done + ggml_sycl_repeat(ctx, dst); // partially done break; case GGML_OP_GET_ROWS: ggml_sycl_op_get_rows(ctx, dst); // done From e1326a78979d38eef7673c9d5eedcabaf8dcebc6 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Fri, 31 Jan 2025 20:51:12 +0530 Subject: [PATCH 04/40] binbcast: add try catch sycl::exception --- ggml/src/ggml-sycl/binbcast.cpp | 27 +++++++++++++++++++++------ 
ggml/src/ggml-sycl/common.hpp | 1 + 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index b2b113432598d..b94b82e799b81 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -226,13 +226,13 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor * src0, const ggml_tensor * } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { op()(src0, src1, dst, (const int16_t *) src0_dd, (const int16_t *) src1_dd, (int16_t *) dst_dd, main_stream); } else { - fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), + GGML_LOG_ERROR("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); GGML_ABORT("fatal error"); } } -inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const void * src0_dd = static_cast(dst->src[0]->data); const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); @@ -240,9 +240,12 @@ inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const void * src0_dd = static_cast(dst->src[0]->data); const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); @@ -250,9 +253,12 @@ inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) 
ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const void * src0_dd = static_cast(dst->src[0]->data); const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); @@ -260,9 +266,12 @@ inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const void * src0_dd = static_cast(dst->src[0]->data); const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); @@ -270,14 +279,20 @@ inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const void * src0_d = static_cast(dst->src[0]->data); void * dst_d = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); ggml_sycl_op_bin_bcast>(dst, dst->src[0], dst, 
nullptr, src0_d, dst_d, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 82719190730e0..79ac6142a4d7e 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -31,6 +31,7 @@ #pragma clang diagnostic ignored "-Wnested-anon-types" #include "ggml-common.h" #pragma clang diagnostic pop +#include "ggml-impl.h" void* ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void* ptr); From fa7c4d86f350b422bfd7b90e133d4dc5bf267744 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Fri, 31 Jan 2025 21:13:28 +0530 Subject: [PATCH 05/40] Fix GGML_SYCL_DEBUG in kernels in other files --- ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 79ac6142a4d7e..7afce5447c530 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -36,7 +36,7 @@ void* ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void* ptr); -static int g_ggml_sycl_debug = 0; +extern int g_ggml_sycl_debug; #define GGML_SYCL_DEBUG(...) 
\ do { \ if (g_ggml_sycl_debug) \ diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ff51786ca6fbe..f618fef80f1bc 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -41,6 +41,7 @@ #include "ggml-sycl/gemm.hpp" static bool g_sycl_loaded = false; +int g_ggml_sycl_debug = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -158,8 +159,8 @@ static void ggml_check_sycl() try { static bool initialized = false; if (!initialized) { - GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); #if defined(GGML_SYCL_FORCE_MMQ) GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: yes\n"); From 95a09ab5056efbf5c69bab16fcb4966827305918 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 09:22:25 +0530 Subject: [PATCH 06/40] ARGMAX: move to a separate file --- ggml/src/ggml-sycl/argmax.cpp | 73 ++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/argmax.hpp | 8 ++++ ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 68 ----------------------------- 4 files changed, 82 insertions(+), 68 deletions(-) create mode 100644 ggml/src/ggml-sycl/argmax.cpp create mode 100644 ggml/src/ggml-sycl/argmax.hpp diff --git a/ggml/src/ggml-sycl/argmax.cpp b/ggml/src/ggml-sycl/argmax.cpp new file mode 100644 index 0000000000000..573a9dc6331c0 --- /dev/null +++ b/ggml/src/ggml-sycl/argmax.cpp @@ -0,0 +1,73 @@ +#include "argmax.hpp" + +static void argmax_f32_i32_sycl(const float * x, int * dst, const int ncols, const int nrows, queue_ptr stream) { + const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE); + const sycl::range<3> block_nums(1, nrows, 1); + const size_t shared_mem = 256 * sizeof(float); + + stream->submit([&](sycl::handler & cgh) { + sycl::local_accessor shared_data(sycl::range<1>(shared_mem / 
sizeof(float)), cgh); + sycl::local_accessor shared_indices(sycl::range<1>(shared_mem / sizeof(float)), cgh); + + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + const int tid = item_ct1.get_local_id(2); + const int row = item_ct1.get_global_id(1); + + float max_val = -INFINITY; + int max_idx = -1; + + for (int col = tid; col < ncols; col += 256) { + float val = x[row * ncols + col]; + if (val > max_val) { + max_val = val; + max_idx = col; + } + } + + shared_data[tid] = max_val; + shared_indices[tid] = max_idx; + item_ct1.barrier(sycl::access::fence_space::local_space); + + for (int stride = 256 / 2; stride > 0; stride >>= 1) { + if (tid < stride) { + float val1 = shared_data[tid]; + float val2 = shared_data[tid + stride]; + if (val2 > val1) { + shared_data[tid] = val2; + shared_indices[tid] = shared_indices[tid + stride]; + } + } + item_ct1.barrier(sycl::access::fence_space::local_space); + } + + if (tid == 0) { + dst[row] = shared_indices[0]; + } + }); + }); +} + +void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_I32); + + const int64_t ncols = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); + + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + int32_t * dst_dd = static_cast(dst->data); + argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_argmax(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); 
+} \ No newline at end of file diff --git a/ggml/src/ggml-sycl/argmax.hpp b/ggml/src/ggml-sycl/argmax.hpp new file mode 100644 index 0000000000000..9888e4c08b196 --- /dev/null +++ b/ggml/src/ggml-sycl/argmax.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_ARGMAX_HPP +#define GGML_SYCL_ARGMAX_HPP + +#include "common.hpp" + +void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_ARGMAX_HPP \ No newline at end of file diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index cdb89e392cb64..05bc85ded9457 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -30,6 +30,7 @@ #include "outprod.hpp" #include "element_wise.hpp" #include "binbcast.hpp" +#include "argmax.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f618fef80f1bc..f9ea4258e9d30 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2347,58 +2347,6 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, } } -static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols, - const int nrows, queue_ptr stream) { - const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE); - const sycl::range<3> block_nums(1, nrows, 1); - const size_t shared_mem = 256 * sizeof(float); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor shared_data( - sycl::range<1>(shared_mem/sizeof(float)), cgh); - sycl::local_accessor shared_indices( - sycl::range<1>(shared_mem/sizeof(float)), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - const int tid = item_ct1.get_local_id(2); - const int row = item_ct1.get_global_id(1); - - float max_val = -INFINITY; - int max_idx = -1; - - for (int col = tid; col < ncols; col += 256) { - float val = x[row * ncols + col]; - if (val > max_val) { - max_val = val; 
- max_idx = col; - } - } - - shared_data[tid] = max_val; - shared_indices[tid] = max_idx; - item_ct1.barrier(sycl::access::fence_space::local_space); - - for (int stride = 256/2; stride > 0; stride >>= 1) { - if (tid < stride) { - float val1 = shared_data[tid]; - float val2 = shared_data[tid + stride]; - if (val2 > val1) { - shared_data[tid] = val2; - shared_indices[tid] = shared_indices[tid + stride]; - } - } - item_ct1.barrier(sycl::access::fence_space::local_space); - } - - - if (tid == 0) { - dst[row] = shared_indices[0]; - } - }); - }); -} static void diag_mask_inf_f32_sycl(const float *x, float *dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, @@ -2746,22 +2694,6 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * argsort_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, order, main_stream); } -inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_I32); - - const int64_t ncols = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); - - dpct::queue_ptr main_stream = ctx.stream(); - const float * src0_dd = static_cast(dst->src[0]->data); - int32_t * dst_dd = static_cast(dst->data); - - argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); -} - inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); From 5288bd58960ae25501deca21c8d02b4b289c643e Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 09:37:29 +0530 Subject: [PATCH 07/40] Argsort: move to a separate file --- ggml/src/ggml-sycl/argsort.cpp | 120 ++++++++++++++++++++++++++++ ggml/src/ggml-sycl/argsort.hpp | 8 ++ ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 129 ------------------------------- 4 files changed, 129 
insertions(+), 129 deletions(-) create mode 100644 ggml/src/ggml-sycl/argsort.cpp create mode 100644 ggml/src/ggml-sycl/argsort.hpp diff --git a/ggml/src/ggml-sycl/argsort.cpp b/ggml/src/ggml-sycl/argsort.cpp new file mode 100644 index 0000000000000..74cb0afd696ea --- /dev/null +++ b/ggml/src/ggml-sycl/argsort.cpp @@ -0,0 +1,120 @@ +#include "argsort.hpp" + +template +static inline void ggml_sycl_swap(T & a, T & b) { + T tmp = a; + a = b; + b = tmp; +} + +template +__dpct_inline__ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad, + const sycl::nd_item<3> & item_ct1, uint8_t * dpct_local) { + // bitonic sort + int col = item_ct1.get_local_id(2); + int row = item_ct1.get_group(1); + + if (col >= ncols_pad) { + return; + } + + const float * x_row = x + row * ncols; + auto dst_row = (int *) dpct_local; + + // initialize indices + dst_row[col] = col; + + item_ct1.barrier(sycl::access::fence_space::local_space); + + for (int k = 2; k <= ncols_pad; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (dst_row[col] >= ncols || + (dst_row[ixj] < ncols && + (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : + x_row[dst_row[col]] < x_row[dst_row[ixj]]))) { + ggml_sycl_swap(dst_row[col], dst_row[ixj]); + } + } else { + if (dst_row[ixj] >= ncols || + (dst_row[col] < ncols && + (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : + x_row[dst_row[col]] > x_row[dst_row[ixj]]))) { + ggml_sycl_swap(dst_row[col], dst_row[ixj]); + } + } + } + /* + DPCT1118:1: SYCL group functions and algorithms must be encountered + in converged control flow. You may need to adjust the code. 
+ */ + item_ct1.barrier(sycl::access::fence_space::local_space); + } + } + + // copy the result to dst without the padding + if (col < ncols) { + dst[row * ncols + col] = dst_row[col]; + } +} + +static void argsort_f32_i32_sycl(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, + queue_ptr stream) { + // bitonic sort requires ncols to be power of 2 + const int ncols_pad = next_power_of_2(ncols); + + const sycl::range<3> block_dims(1, 1, ncols_pad); + const sycl::range<3> block_nums(1, nrows, 1); + const size_t shared_mem = ncols_pad * sizeof(int); + + if (order == GGML_SORT_ORDER_ASC) { + stream->submit([&](sycl::handler & cgh) { + sycl::local_accessor dpct_local_acc_ct1(sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32( + x, dst, ncols, ncols_pad, item_ct1, + dpct_local_acc_ct1.get_multi_ptr().get()); + }); + }); + } else if (order == GGML_SORT_ORDER_DESC) { + stream->submit([&](sycl::handler & cgh) { + sycl::local_accessor dpct_local_acc_ct1(sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32( + x, dst, ncols, ncols_pad, item_ct1, + dpct_local_acc_ct1.get_multi_ptr().get()); + }); + }); + } else { + GGML_ABORT("fatal error"); + } +} + +inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_I32); + + const int64_t ncols = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + int32_t * dst_dd = static_cast(dst->data); + + argsort_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, order, main_stream); +} 
catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_argsort(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} \ No newline at end of file diff --git a/ggml/src/ggml-sycl/argsort.hpp b/ggml/src/ggml-sycl/argsort.hpp new file mode 100644 index 0000000000000..e79d20e8a7592 --- /dev/null +++ b/ggml/src/ggml-sycl/argsort.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_ARGSORT_HPP +#define GGML_SYCL_ARGSORT_HPP + +#include "common.hpp" + +void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_ARGSORT_HPP diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 05bc85ded9457..ece5449d633dd 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -31,6 +31,7 @@ #include "element_wise.hpp" #include "binbcast.hpp" #include "argmax.hpp" +#include "argsort.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f9ea4258e9d30..f4d606b4a2920 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1730,70 +1730,6 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, } -template -static inline void ggml_sycl_swap(T & a, T & b) { - T tmp = a; - a = b; - b = tmp; -} - -template -__dpct_inline__ static void -k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad, - const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) { - // bitonic sort - int col = item_ct1.get_local_id(2); - int row = item_ct1.get_group(1); - - if (col >= ncols_pad) { - return; - } - - const float * x_row = x + row * ncols; - auto dst_row = (int *)dpct_local; - - 
// initialize indices - dst_row[col] = col; - - item_ct1.barrier(sycl::access::fence_space::local_space); - - for (int k = 2; k <= ncols_pad; k *= 2) { - for (int j = k / 2; j > 0; j /= 2) { - int ixj = col ^ j; - if (ixj > col) { - if ((col & k) == 0) { - if (dst_row[col] >= ncols || - (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? - x_row[dst_row[col]] > x_row[dst_row[ixj]] : - x_row[dst_row[col]] < x_row[dst_row[ixj]])) - ) { - ggml_sycl_swap(dst_row[col], dst_row[ixj]); - } - } else { - if (dst_row[ixj] >= ncols || - (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? - x_row[dst_row[col]] < x_row[dst_row[ixj]] : - x_row[dst_row[col]] > x_row[dst_row[ixj]])) - ) { - ggml_sycl_swap(dst_row[col], dst_row[ixj]); - } - } - } - /* - DPCT1118:1: SYCL group functions and algorithms must be encountered - in converged control flow. You may need to adjust the code. - */ - item_ct1.barrier(sycl::access::fence_space::local_space); - } - } - - // copy the result to dst without the padding - if (col < ncols) { - dst[row * ncols + col] = dst_row[col]; - } -} - - static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past, const sycl::nd_item<3> &item_ct1) { const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + @@ -2304,49 +2240,6 @@ static int next_power_of_2(int x) { return n; } -static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, - const int nrows, ggml_sort_order order, - queue_ptr stream) { - // bitonic sort requires ncols to be power of 2 - const int ncols_pad = next_power_of_2(ncols); - - const sycl::range<3> block_dims(1, 1, ncols_pad); - const sycl::range<3> block_nums(1, nrows, 1); - const size_t shared_mem = ncols_pad * sizeof(int); - - if (order == GGML_SORT_ORDER_ASC) { - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor dpct_local_acc_ct1( - sycl::range<1>(shared_mem), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * 
block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32( - x, dst, ncols, ncols_pad, item_ct1, - dpct_local_acc_ct1.get_multi_ptr() - .get()); - }); - }); - } else if (order == GGML_SORT_ORDER_DESC) { - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor dpct_local_acc_ct1( - sycl::range<1>(shared_mem), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32( - x, dst, ncols, ncols_pad, item_ct1, - dpct_local_acc_ct1.get_multi_ptr() - .get()); - }); - }); - } else { - GGML_ABORT("fatal error"); - } -} - static void diag_mask_inf_f32_sycl(const float *x, float *dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, @@ -2678,22 +2571,6 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); } -inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_I32); - - const int64_t ncols = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); - - enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - dpct::queue_ptr main_stream = ctx.stream(); - const float * src0_dd = static_cast(dst->src[0]->data); - int32_t * dst_dd = static_cast(dst->data); - - argsort_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, order, main_stream); -} - inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); @@ -3758,12 +3635,6 @@ static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * ds ggml_sycl_op_sum_rows(ctx, dst); } -static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_argsort(ctx, dst); -} - - void 
ggml_sycl_set_main_device(const int main_device) try { if (dpct::get_current_device_id() == static_cast (main_device)) { return; From a153f1972d1924a7f47d8ce30fcb52df3e012b3d Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 09:40:44 +0530 Subject: [PATCH 08/40] ggml_sycl_compute_forward: fixup function calling names and remove comments --- ggml/src/ggml-sycl/ggml-sycl.cpp | 100 +++++++++++++++---------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f4d606b4a2920..0d771d61d8804 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3665,138 +3665,138 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens switch (dst->op) { case GGML_OP_ARGMAX: - ggml_sycl_op_argmax(ctx, dst); // done + ggml_sycl_argmax(ctx, dst); break; case GGML_OP_CONV_TRANSPOSE_1D: - ggml_sycl_op_conv_transpose_1d(ctx, dst); // already good + ggml_sycl_op_conv_transpose_1d(ctx, dst); break; case GGML_OP_REPEAT: - ggml_sycl_repeat(ctx, dst); // partially done + ggml_sycl_repeat(ctx, dst); break; case GGML_OP_GET_ROWS: - ggml_sycl_op_get_rows(ctx, dst); // done + ggml_sycl_op_get_rows(ctx, dst); break; case GGML_OP_DUP: - ggml_sycl_dup(ctx, dst); // done + ggml_sycl_dup(ctx, dst); break; case GGML_OP_ADD: case GGML_OP_ADD1: // TODO: more efficient implementation - ggml_sycl_add(ctx, dst); // partially done + ggml_sycl_add(ctx, dst); break; case GGML_OP_SUB: - ggml_sycl_sub(ctx, dst); // partially done + ggml_sycl_sub(ctx, dst); break; case GGML_OP_ACC: - ggml_sycl_acc(ctx, dst); // fully done + ggml_sycl_acc(ctx, dst); break; case GGML_OP_MUL: - ggml_sycl_mul(ctx, dst); // partially done + ggml_sycl_mul(ctx, dst); break; case GGML_OP_LOG: - ggml_sycl_log(ctx, dst); // fully done + ggml_sycl_log(ctx, dst); break; case GGML_OP_DIV: - ggml_sycl_div(ctx, dst); // partially done + ggml_sycl_div(ctx, dst); break; case 
GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_NEG: - ggml_sycl_neg(ctx, dst); // done + ggml_sycl_neg(ctx, dst); break; case GGML_UNARY_OP_STEP: - ggml_sycl_step(ctx, dst); // done + ggml_sycl_step(ctx, dst); break; case GGML_UNARY_OP_GELU: - ggml_sycl_gelu(ctx, dst); // done + ggml_sycl_gelu(ctx, dst); break; case GGML_UNARY_OP_SILU: - ggml_sycl_silu(ctx, dst); // done + ggml_sycl_silu(ctx, dst); break; case GGML_UNARY_OP_GELU_QUICK: - ggml_sycl_gelu_quick(ctx, dst); // done + ggml_sycl_gelu_quick(ctx, dst); break; case GGML_UNARY_OP_TANH: - ggml_sycl_tanh(ctx, dst); // done + ggml_sycl_tanh(ctx, dst); break; case GGML_UNARY_OP_RELU: - ggml_sycl_relu(ctx, dst); // done + ggml_sycl_relu(ctx, dst); break; case GGML_UNARY_OP_SIGMOID: - ggml_sycl_sigmoid(ctx, dst); // done + ggml_sycl_sigmoid(ctx, dst); break; case GGML_UNARY_OP_HARDSIGMOID: - ggml_sycl_hardsigmoid(ctx, dst); // done + ggml_sycl_hardsigmoid(ctx, dst); break; case GGML_UNARY_OP_HARDSWISH: - ggml_sycl_hardswish(ctx, dst); // done + ggml_sycl_hardswish(ctx, dst); break; case GGML_UNARY_OP_EXP: - ggml_sycl_exp(ctx, dst); // done + ggml_sycl_exp(ctx, dst); break; default: return false; } break; case GGML_OP_NORM: - ggml_sycl_norm(ctx, dst); // done + ggml_sycl_norm(ctx, dst); break; case GGML_OP_GROUP_NORM: - ggml_sycl_group_norm(ctx, dst); // done + ggml_sycl_group_norm(ctx, dst); break; case GGML_OP_CONCAT: - ggml_sycl_op_concat(ctx, dst); // already good + ggml_sycl_op_concat(ctx, dst); break; case GGML_OP_UPSCALE: - ggml_sycl_upscale(ctx, dst); // done + ggml_sycl_upscale(ctx, dst); break; case GGML_OP_PAD: - ggml_sycl_pad(ctx, dst); // done + ggml_sycl_pad(ctx, dst); break; case GGML_OP_LEAKY_RELU: - ggml_sycl_leaky_relu(ctx, dst); // done + ggml_sycl_leaky_relu(ctx, dst); break; case GGML_OP_RMS_NORM: - ggml_sycl_rms_norm(ctx, dst); // done + ggml_sycl_rms_norm(ctx, dst); break; case GGML_OP_MUL_MAT: if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { return false; } /* 
ggml_sycl_mul_mat_id is dependent on ggml_sycl_mul_mat */ - ggml_sycl_mul_mat(ctx, dst->src[0], dst->src[1], dst); // good + ggml_sycl_mul_mat(ctx, dst->src[0], dst->src[1], dst); break; case GGML_OP_MUL_MAT_ID: if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { return false; } - ggml_sycl_mul_mat_id(ctx, dst); // good + ggml_sycl_mul_mat_id(ctx, dst); break; case GGML_OP_OUT_PROD: - ggml_sycl_op_out_prod(ctx, dst); // good + ggml_sycl_op_out_prod(ctx, dst); break; case GGML_OP_SCALE: - ggml_sycl_scale(ctx, dst); // done + ggml_sycl_scale(ctx, dst); break; case GGML_OP_SQR: - ggml_sycl_sqr(ctx, dst); // done + ggml_sycl_sqr(ctx, dst); break; case GGML_OP_SQRT: - ggml_sycl_sqrt(ctx, dst); // done + ggml_sycl_sqrt(ctx, dst); break; case GGML_OP_SIN: - ggml_sycl_sin(ctx, dst); //done + ggml_sycl_sin(ctx, dst); break; case GGML_OP_COS: - ggml_sycl_cos(ctx, dst); // done + ggml_sycl_cos(ctx, dst); break; case GGML_OP_CLAMP: - ggml_sycl_clamp(ctx, dst); // done + ggml_sycl_clamp(ctx, dst); break; case GGML_OP_CPY: - ggml_sycl_cpy(ctx, dst->src[0], dst->src[1]); // okayish, need check + ggml_sycl_cpy(ctx, dst->src[0], dst->src[1]); break; case GGML_OP_CONT: - ggml_sycl_dup(ctx, dst); // done + ggml_sycl_dup(ctx, dst); break; case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -3806,34 +3806,34 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens GGML_SYCL_DEBUG("%s: Tensor NO-OP\n", __func__); break; case GGML_OP_DIAG_MASK_INF: - ggml_sycl_diag_mask_inf(ctx, dst); // done + ggml_sycl_diag_mask_inf(ctx, dst); break; case GGML_OP_SOFT_MAX: - ggml_sycl_op_soft_max(ctx, dst); // already good + ggml_sycl_op_soft_max(ctx, dst); break; case GGML_OP_ROPE: - ggml_sycl_rope(ctx, dst); // done + ggml_sycl_rope(ctx, dst); break; case GGML_OP_IM2COL: - ggml_sycl_im2col(ctx, dst); // done + ggml_sycl_im2col(ctx, dst); break; case GGML_OP_POOL_2D: - ggml_sycl_pool2d(ctx, dst); // done + ggml_sycl_pool2d(ctx, dst); break; case GGML_OP_SUM: - ggml_sycl_sum(ctx, dst); 
// done + ggml_sycl_sum(ctx, dst); break; case GGML_OP_SUM_ROWS: - ggml_sycl_sum_rows(ctx, dst); // done + ggml_sycl_sum_rows(ctx, dst); break; case GGML_OP_ARGSORT: - ggml_sycl_argsort(ctx, dst); // done + ggml_sycl_argsort(ctx, dst); break; case GGML_OP_TIMESTEP_EMBEDDING: - ggml_sycl_op_timestep_embedding(ctx, dst); // already pretty good + ggml_sycl_op_timestep_embedding(ctx, dst); break; case GGML_OP_RWKV_WKV6: - ggml_sycl_op_rwkv_wkv6(ctx, dst); // good + ggml_sycl_op_rwkv_wkv6(ctx, dst); break; case GGML_OP_GATED_LINEAR_ATTN: ggml_sycl_op_gated_linear_attn(ctx, dst); From 51bedb847ec21ebcc718015228b68210905fc6e6 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 09:44:30 +0530 Subject: [PATCH 09/40] argmax: move missing function to file and fix function name --- ggml/src/ggml-sycl/argmax.cpp | 2 +- ggml/src/ggml-sycl/argmax.hpp | 2 +- ggml/src/ggml-sycl/argsort.cpp | 8 ++++++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 8 -------- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-sycl/argmax.cpp b/ggml/src/ggml-sycl/argmax.cpp index 573a9dc6331c0..946565f87aeb8 100644 --- a/ggml/src/ggml-sycl/argmax.cpp +++ b/ggml/src/ggml-sycl/argmax.cpp @@ -47,7 +47,7 @@ static void argmax_f32_i32_sycl(const float * x, int * dst, const int ncols, con }); } -void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { +static void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-sycl/argmax.hpp b/ggml/src/ggml-sycl/argmax.hpp index 9888e4c08b196..9093528f23bf8 100644 --- a/ggml/src/ggml-sycl/argmax.hpp +++ b/ggml/src/ggml-sycl/argmax.hpp @@ -3,6 +3,6 @@ #include "common.hpp" -void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_ARGMAX_HPP \ No 
newline at end of file diff --git a/ggml/src/ggml-sycl/argsort.cpp b/ggml/src/ggml-sycl/argsort.cpp index 74cb0afd696ea..8047f7d478ca5 100644 --- a/ggml/src/ggml-sycl/argsort.cpp +++ b/ggml/src/ggml-sycl/argsort.cpp @@ -1,5 +1,13 @@ #include "argsort.hpp" +static int next_power_of_2(int x) { + int n = 1; + while (n < x) { + n *= 2; + } + return n; +} + template static inline void ggml_sycl_swap(T & a, T & b) { T tmp = a; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 0d771d61d8804..803ea6c237047 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2232,14 +2232,6 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, }); } -static int next_power_of_2(int x) { - int n = 1; - while (n < x) { - n *= 2; - } - return n; -} - static void diag_mask_inf_f32_sycl(const float *x, float *dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, From 3a346592b879954ed2f291e013e06862b75b4053 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 09:48:29 +0530 Subject: [PATCH 10/40] argsort: add a space at the end of file --- ggml/src/ggml-sycl/argsort.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/argsort.cpp b/ggml/src/ggml-sycl/argsort.cpp index 8047f7d478ca5..9c88d7323cec0 100644 --- a/ggml/src/ggml-sycl/argsort.cpp +++ b/ggml/src/ggml-sycl/argsort.cpp @@ -125,4 +125,4 @@ void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_argsort(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); -} \ No newline at end of file +} From aaf9ed070d5fe45ab1564ba68f2b9455c28638a5 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 09:49:42 +0530 Subject: [PATCH 11/40] Add spaces --- ggml/src/ggml-sycl/argmax.cpp | 2 +- ggml/src/ggml-sycl/argmax.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git 
a/ggml/src/ggml-sycl/argmax.cpp b/ggml/src/ggml-sycl/argmax.cpp index 946565f87aeb8..11119b56c467e 100644 --- a/ggml/src/ggml-sycl/argmax.cpp +++ b/ggml/src/ggml-sycl/argmax.cpp @@ -70,4 +70,4 @@ void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_argmax(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); -} \ No newline at end of file +} diff --git a/ggml/src/ggml-sycl/argmax.hpp b/ggml/src/ggml-sycl/argmax.hpp index 9093528f23bf8..431a7d6e71b0d 100644 --- a/ggml/src/ggml-sycl/argmax.hpp +++ b/ggml/src/ggml-sycl/argmax.hpp @@ -5,4 +5,4 @@ void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -#endif // GGML_SYCL_ARGMAX_HPP \ No newline at end of file +#endif // GGML_SYCL_ARGMAX_HPP From a16b6b7681a38380c09a04fc907e520bd52ce24d Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 10:59:28 +0530 Subject: [PATCH 12/40] eltwise: sort includes --- ggml/src/ggml-sycl/common.cpp | 3 --- ggml/src/ggml-sycl/common.hpp | 2 ++ ggml/src/ggml-sycl/element_wise.cpp | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 9260a58c26278..3cdc762236d9c 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -12,9 +12,6 @@ #include "common.hpp" -#include "ggml-backend-impl.h" -#include "ggml-impl.h" - int get_current_device_id() { return dpct::dev_mgr::instance().current_device_id(); } diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 7afce5447c530..38f5cda7297f3 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -31,7 +31,9 @@ #pragma clang diagnostic ignored "-Wnested-anon-types" #include "ggml-common.h" #pragma clang diagnostic pop +#include "ggml-backend-impl.h" #include "ggml-impl.h" +#include "ggml.h" void* ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void* ptr); diff --git 
a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 9682708fd503a..6d95feec05e0d 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -1,6 +1,4 @@ -#include "common.hpp" #include "element_wise.hpp" -#include "ggml.h" void acc_f32(const float * x, const float * y, float * dst, const int ne, const int ne10, const int ne11, const int ne12, From ecacff3f6e00e35e064d976aab43ef6a297aad9a Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 11:21:09 +0530 Subject: [PATCH 13/40] CPY: move to a separate file --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/cpy.cpp | 389 ++++++++++++++++++++++++++ ggml/src/ggml-sycl/cpy.hpp | 11 + ggml/src/ggml-sycl/ggml-sycl.cpp | 460 ------------------------------- 4 files changed, 401 insertions(+), 460 deletions(-) create mode 100644 ggml/src/ggml-sycl/cpy.cpp create mode 100644 ggml/src/ggml-sycl/cpy.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index ece5449d633dd..38e4c56ce9f6c 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -32,6 +32,7 @@ #include "binbcast.hpp" #include "argmax.hpp" #include "argsort.hpp" +#include "cpy.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp new file mode 100644 index 0000000000000..061fc06848b76 --- /dev/null +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -0,0 +1,389 @@ +#include "cpy.hpp" + +static void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + sycl::half * dsti = (sycl::half *) cdsti; + + *dsti = sycl::vec(*xi).convert()[0]; +} + +static void cpy_1_f16_f16(const char * cxi, char * cdsti) { + const sycl::half * xi = (const sycl::half *) cxi; + sycl::half * dsti = 
(sycl::half *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_f16_f32(const char * cxi, char * cdsti) { + const sycl::half * xi = (const sycl::half *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_i16_i16(const char * cxi, char * cdsti) { + const int16_t * xi = (const int16_t *) cxi; + int16_t * dsti = (int16_t *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_i32_i32(const char * cxi, char * cdsti) { + const int32_t * xi = (const int32_t *) cxi; + int32_t * dsti = (int32_t *) cdsti; + + *dsti = *xi; +} + +template +static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + + if (i >= ne) { + return; + } + + // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +static void cpy_blck_f32_q8_0(const char * cxi, 
char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = sycl::fmax(amax, sycl::fabs((float) v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j] * id; + + dsti->qs[j] = sycl::round((float) x0); + } +} + +static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float) v)) { + amax = sycl::fabs((float) v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0 / 2; ++j) { + const float x0 = xi[0 + j] * id; + const float x1 = xi[QK4_0 / 2 + j] * id; + + const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + if (v < vmin) { + vmin = v; + } + if (v > vmax) { + vmax = v; + } + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 
1.0f / d : 0.0f; + + dsti->dm.x() = d; + dsti->dm.y() = vmin; + + for (int j = 0; j < QK4_1 / 2; ++j) { + const float x0 = (xi[0 + j] - vmin) * id; + const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id; + + const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +template +static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; + + if (i >= ne) { + return; + } + + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_blck(cx + x_offset, cdst + dst_offset); +} + +static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / 
SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, 
cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK8_0 == 0); + const int num_blocks = ne / QK8_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK4_0 == 0); + const int num_blocks = ne / QK4_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK4_1 == 0); + const int num_blocks = ne / QK4_1; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + 
cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int 
nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS; + + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + queue_ptr main_stream = ctx.stream(); + + char * src0_ddc = (char *) src0->data; + char * src1_ddc = (char *) src1->data; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, 
ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f16_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f16_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { + ggml_cpy_i16_i16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + ggml_cpy_i32_i32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else { + GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), + ggml_type_name(src1->type)); + GGML_ABORT("fatal error"); + } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp new file mode 100644 index 0000000000000..6be62076fd963 --- /dev/null +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -0,0 +1,11 @@ +#ifndef GGML_SYCL_CPY_HPP +#define GGML_SYCL_CPY_HPP + +#include "common.hpp" +#include + +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); + +void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); + +#endif // GGML_SYCL_CPY_HPP \ 
No newline at end of file diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 803ea6c237047..c147b6ec05f7e 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1265,8 +1265,6 @@ std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(q // struct ggml_sycl_pool_vmm : public ggml_sycl_pool /// kernels - -typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_sycl_op_mul_mat_t)( ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -1525,193 +1523,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous } } -static void cpy_1_f32_f32(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_f32_f16(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - sycl::half *dsti = (sycl::half *)cdsti; - - *dsti = sycl::vec(*xi) - .convert()[0]; -} - -static void cpy_1_f16_f16(const char * cxi, char * cdsti) { - const sycl::half *xi = (const sycl::half *)cxi; - sycl::half *dsti = (sycl::half *)cdsti; - - *dsti = *xi; -} - -static void cpy_1_f16_f32(const char * cxi, char * cdsti) { - const sycl::half *xi = (const sycl::half *)cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_i16_i16(const char * cxi, char * cdsti) { - const int16_t *xi = (const int16_t *)cxi; - int16_t *dsti = (int16_t *)cdsti; - - *dsti = *xi; -} - -static void cpy_1_i32_i32(const char * cxi, char * cdsti) { - const int32_t *xi = (const int32_t *)cxi; - int32_t *dsti = (int32_t *)cdsti; - - *dsti = *xi; -} - -template -static void cpy_f32_f16(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int 
nb13, const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= ne) { - return; - } - - // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor - // then combine those indices with the corresponding byte offsets to get the total offsets - const int i03 = i/(ne00 * ne01 * ne02); - const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - - const int i13 = i/(ne10 * ne11 * ne12); - const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; - - cpy_1(cx + x_offset, cdst + dst_offset); -} - -static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q8_0 * dsti = (block_q8_0 *) cdsti; - - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = xi[j]; - amax = sycl::fmax(amax, sycl::fabs((float)v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = xi[j]*id; - - dsti->qs[j] = sycl::round((float)x0); - } -} - -static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_0 * dsti = (block_q4_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_0; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float)v)) { - amax = sycl::fabs((float)v); - vmax = v; - } - } - - const float d = vmax / -8; - const float id = d ? 
1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK4_0/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK4_0/2 + j]*id; - - const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_1 * dsti = (block_q4_1 *) cdsti; - - float vmin = FLT_MAX; - float vmax = -FLT_MAX; - - for (int j = 0; j < QK4_1; ++j) { - const float v = xi[j]; - - if (v < vmin) vmin = v; - if (v > vmax) vmax = v; - } - - const float d = (vmax - vmin) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->dm.x() = d; - dsti->dm.y() = vmin; - - for (int j = 0; j < QK4_1/2; ++j) { - const float x0 = (xi[0 + j] - vmin)*id; - const float x1 = (xi[QK4_1/2 + j] - vmin)*id; - - const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -template -static void cpy_f32_q(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { - const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2)) * - qk; - - if (i >= ne) { - return; - } - - const int i03 = i/(ne00 * ne01 * ne02); - const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - - const int i13 = i/(ne10 * ne11 * ne12); - const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int i11 = (i - 
i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; - - cpy_blck(cx + x_offset, cdst + dst_offset); -} - static void k_sum_rows_f32(const float * x, float * dst, const int ncols, const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(1); @@ -1970,232 +1781,6 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( } } -static void -ggml_cpy_f16_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00, - const int ne01, const int ne02, const int nb00, - const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, - const int nb10, const int nb11, const int nb12, - const int nb13, queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, - nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - 
[=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK8_0 == 0); - const int num_blocks = ne / QK8_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const 
int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK4_0 == 0); - const int num_blocks = ne / QK4_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK4_1 == 0); - const int num_blocks = ne / QK4_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const 
int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - // dpct::has_capability_or_fail(stream->get_device(), - // {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - // dpct::has_capability_or_fail(stream->get_device(), - // {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - static void scale_f32_sycl(const float *x, float *dst, const float scale, const int k, queue_ptr stream) { const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE; @@ -3550,51 +3135,6 @@ static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) ggml_sycl_op_clamp(ctx, dst); } -static void 
ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { - const int64_t ne = ggml_nelements(src0); - GGML_ASSERT(ne == ggml_nelements(src1)); - - GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); - GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - - GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS; - - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - queue_ptr main_stream = ctx.stream(); - - char * src0_ddc = (char *) src0->data; - char * src1_ddc = (char *) src1->data; - - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f16_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, 
ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { - ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { - ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else { - GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, - ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ABORT("fatal error"); - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { // TODO: why do we pass dst as src1 here? ggml_sycl_cpy(ctx, dst->src[0], dst); From 7d8d689d394cd7aae5bb808cce22a06efd193b7e Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 11:45:18 +0530 Subject: [PATCH 14/40] eltwise: add back split buffer type checks --- ggml/src/ggml-sycl/element_wise.cpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 6d95feec05e0d..70a6de470308c 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -512,6 +512,7 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -523,6 +524,7 
@@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -534,6 +536,7 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -545,6 +548,7 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor *dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); @@ -556,6 +560,7 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -566,6 +571,7 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), 
GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -576,6 +582,7 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); @@ -588,6 +595,7 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -598,6 +606,7 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor *dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -609,6 +618,7 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *d GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -620,6 +630,7 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, 
ggml_tensor * dst GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -630,6 +641,7 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -641,7 +653,7 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -653,6 +665,7 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor *dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); @@ -664,6 +677,7 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor *dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const 
float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); @@ -675,6 +689,7 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); float negative_slope; memcpy(&negative_slope, dst->op_params, sizeof(float)); @@ -690,6 +705,7 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); @@ -701,6 +717,7 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const float sf0 = (float)dst->ne[0]/dst->src[0]->ne[0]; const float sf1 = (float)dst->ne[1]/dst->src[0]->ne[1]; @@ -721,6 +738,7 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -738,6 +756,8 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == 
GGML_TYPE_F32); GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + GGML_ASSERT(strcmp(dst->src[1]->buffer->buft->iface.get_name(dst->src[1]->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); const float * src0_dd = static_cast(dst->src[0]->data); From 04d8b038b8a073bea392c3c63bb81ee3bfc76d5b Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 12:06:58 +0530 Subject: [PATCH 15/40] Add back split buffer type checks --- ggml/src/ggml-sycl/argmax.cpp | 1 + ggml/src/ggml-sycl/cpy.hpp | 2 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 8 ++++++++ ggml/src/ggml-sycl/im2col.cpp | 1 + ggml/src/ggml-sycl/norm.cpp | 3 +++ ggml/src/ggml-sycl/rope.cpp | 2 ++ 6 files changed, 16 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/argmax.cpp b/ggml/src/ggml-sycl/argmax.cpp index 11119b56c467e..76bc6f28ca7b1 100644 --- a/ggml/src/ggml-sycl/argmax.cpp +++ b/ggml/src/ggml-sycl/argmax.cpp @@ -52,6 +52,7 @@ static void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * d GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_I32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const int64_t ncols = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp index 6be62076fd963..1fbc7b75cc4f8 100644 --- a/ggml/src/ggml-sycl/cpy.hpp +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -8,4 +8,4 @@ typedef void (*cpy_kernel_t)(const char * cx, char * cdst); void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); -#endif // GGML_SYCL_CPY_HPP \ No newline at end of file +#endif // GGML_SYCL_CPY_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 
c147b6ec05f7e..c5066b5642c74 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1920,6 +1920,8 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); switch (dst->src[0]->type) { case GGML_TYPE_F16: @@ -2087,6 +2089,7 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *d GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int32_t * opts = (const int32_t *)dst->op_params; enum ggml_op_pool op = static_cast(opts[0]); @@ -2125,6 +2128,7 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *d inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ne = ggml_nelements(dst->src[0]); dpct::queue_ptr main_stream = ctx.stream(); @@ -2138,6 +2142,7 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ncols = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); @@ -2152,6 +2157,7 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t 
ne00 = dst->src[0]->ne[0]; const int64_t ne01 = dst->src[0]->ne[1]; @@ -2169,6 +2175,7 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * ds GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); float scale; memcpy(&scale, dst->op_params, sizeof(float)); @@ -2189,6 +2196,7 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor *dst GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); float min; float max; diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index 4da9d12d8e5a4..d6b998b7e98a2 100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -86,6 +86,7 @@ void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->src[1]->buffer->buft->iface.get_name(dst->src[1]->buffer->buft), GGML_SYCL_NAME "_Split") != 0); GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 628bdfa4dbc47..8b096b998df3f 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -315,6 +315,7 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const int64_t ne00 = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); @@ -333,6 +334,7 @@ void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { GGML_ASSERT(dst->src[0]->type == 
GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); int num_groups = dst->op_params[0]; @@ -350,6 +352,7 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const int64_t ne00 = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 2a6c3ca7554da..59fcb9f6cf369 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -197,6 +197,8 @@ void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); GGML_ASSERT(dst->src[0]->type == dst->type); + GGML_ASSERT(strcmp(dst->src[1]->buffer->buft->iface.get_name(dst->src[1]->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const int64_t ne00 = dst->src[0]->ne[0]; const int64_t ne01 = dst->src[0]->ne[1]; From 98f5fd2fd1162eaa461c14b5ef077c0f5b89a26c Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 19:08:42 +0530 Subject: [PATCH 16/40] getrows: move to a separate file --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/getrows.cpp | 165 ++++++++++++++++++++++++++ ggml/src/ggml-sycl/getrows.hpp | 8 ++ ggml/src/ggml-sycl/ggml-sycl.cpp | 191 ------------------------------- 4 files changed, 174 insertions(+), 191 deletions(-) create mode 100644 ggml/src/ggml-sycl/getrows.cpp create mode 100644 ggml/src/ggml-sycl/getrows.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp 
b/ggml/src/ggml-sycl/backend.hpp index 38e4c56ce9f6c..24cf492b3d93c 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -33,6 +33,7 @@ #include "argmax.hpp" #include "argsort.hpp" #include "cpy.hpp" +#include "getrows.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp new file mode 100644 index 0000000000000..501c1f7a6a646 --- /dev/null +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -0,0 +1,165 @@ +#include "getrows.hpp" +#include "dequantize.hpp" + +template +static void k_get_rows(const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> & item_ct1 /*, size_t s13*/) { + const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2)) * 2; + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + item_ct1.get_local_id(0)) / ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + item_ct1.get_local_id(0)) % ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10 * s10 + i11 * s11 + i12 * s12]; + + dst_t * dst_row = dst + i10 * s1 + i11 * s2 + i12 * s3; + const void * src0_row = (const char *) src0 + i01 * nb01 + i11 * nb02 + i12 * nb03; + + const int ib = i00 / qk; // block index + const int iqs = (i00 % qk) / qr; // quant index + const int iybs = i00 - i00 % qk; // dst block start index + const int y_offset = qr == 1 ? 
1 : qk / 2; + + // dequantize + dfloat2 v; + dequantize_kernel(src0_row, ib, iqs, v); + + dst_row[iybs + iqs + 0] = v.x(); + dst_row[iybs + iqs + y_offset] = v.y(); +} + +template +static void k_get_rows_float(const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> & item_ct1 /*, size_t s13*/) { + const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2); + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + item_ct1.get_local_id(0)) / ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + item_ct1.get_local_id(0)) % ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10 * s10 + i11 * s11 + i12 * s12]; + + dst_t * dst_row = dst + i10 * s1 + i11 * s2 + i12 * s3; + const src0_t * src0_row = (const src0_t *) ((const char *) src0 + i01 * nb01 + i11 * nb02 + i12 * nb03); + + dst_row[i00] = src0_row[i00]; +} + +template +static void get_rows_sycl(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + 2 * SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2 * SYCL_GET_ROWS_BLOCK_SIZE); + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(dst->src[1]); + const size_t s11 = nb11 / 
ggml_element_size(dst->src[1]); + const size_t s12 = nb12 / ggml_element_size(dst->src[1]); + //const size_t s13 = nb13 / ggml_element_size(dst->src[1]); + + GGML_ASSERT(ne00 % 2 == 0); + const void * src0_dd = dst->src[0]->data; + const int32_t * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + + dpct::queue_ptr stream = ctx.stream(); + + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + k_get_rows(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, + item_ct1); + }); +} + +template static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE; + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(dst->src[1]); + const size_t s11 = nb11 / ggml_element_size(dst->src[1]); + const size_t s12 = nb12 / ggml_element_size(dst->src[1]); + //const size_t s13 = nb13 / ggml_element_size(dst->src[1]); + const src0_t * src0_dd = static_cast(dst->src[0]->data); + const int32_t * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + + dpct::queue_ptr stream = ctx.stream(); + + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, + item_ct1); + }); + } +} + +void 
ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); + GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + GGML_ASSERT(strcmp(dst->src[1]->buffer->buft->iface.get_name(dst->src[1]->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + + switch (dst->src[0]->type) { + case GGML_TYPE_F16: + get_rows_sycl_float(ctx, dst); + break; + case GGML_TYPE_F32: + get_rows_sycl_float(ctx, dst); + break; + case GGML_TYPE_Q4_0: + get_rows_sycl(ctx, dst); + break; + case GGML_TYPE_Q4_1: + get_rows_sycl(ctx, dst); + break; + case GGML_TYPE_Q5_0: + get_rows_sycl(ctx, dst); + break; + case GGML_TYPE_Q5_1: + get_rows_sycl(ctx, dst); + break; + case GGML_TYPE_Q8_0: + get_rows_sycl(ctx, dst); + break; + default: + // TODO: k-quants + GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(dst->src[0]->type)); + GGML_ABORT("fatal error"); + break; + } +} diff --git a/ggml/src/ggml-sycl/getrows.hpp b/ggml/src/ggml-sycl/getrows.hpp new file mode 100644 index 0000000000000..285c56af32a0f --- /dev/null +++ b/ggml/src/ggml-sycl/getrows.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_GETROWS_HPP +#define GGML_SYCL_GETROWS_HPP + +#include "common.hpp" + +void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_GETROWS_HPP \ No newline at end of file diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index c5066b5642c74..93f93619954ef 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1336,83 +1336,6 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, reinterpret_cast(y[ib].ds.y()) = 
sum; } -template -static void k_get_rows( - const void * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12, - const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { - - const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + - item_ct1.get_local_id(2)) * - 2; - const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) / - ne12; - const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) % - ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; - - const int ib = i00/qk; // block index - const int iqs = (i00%qk)/qr; // quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 
1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(src0_row, ib, iqs, v); - - dst_row[iybs + iqs + 0] = v.x(); - dst_row[iybs + iqs + y_offset] = v.y(); -} - -template -static void k_get_rows_float( - const src0_t * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12, - const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { - - const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + - item_ct1.get_local_id(2); - const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) / - ne12; - const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) % - ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); - - dst_row[i00] = src0_row[i00]; -} - static void mul_mat_p021_f16_f32( const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, @@ -1644,80 +1567,6 @@ static void pool2d_nchw_kernel( o_ptr[cur_oh * ow + cur_ow] = res; } -template -static void get_rows_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - - GGML_SYCL_TENSOR_BINARY_OP_LOCALS - - const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); - const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE); - const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); - - // strides in elements - //const 
size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(dst->src[1]); - const size_t s11 = nb11 / ggml_element_size(dst->src[1]); - const size_t s12 = nb12 / ggml_element_size(dst->src[1]); - //const size_t s13 = nb13 / ggml_element_size(dst->src[1]); - - GGML_ASSERT(ne00 % 2 == 0); - const void * src0_dd = dst->src[0]->data; - const int32_t * src1_dd = static_cast(dst->src[1]->data); - float * dst_dd = static_cast(dst->data); - - dpct::queue_ptr stream = ctx.stream(); - - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_get_rows( - src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); - -} - -template -static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - - GGML_SYCL_TENSOR_BINARY_OP_LOCALS - - const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); - const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE; - const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(dst->src[1]); - const size_t s11 = nb11 / ggml_element_size(dst->src[1]); - const size_t s12 = nb12 / ggml_element_size(dst->src[1]); - //const size_t s13 = nb13 / ggml_element_size(dst->src[1]); - const src0_t * src0_dd = static_cast(dst->src[0]->data); - const int32_t * src1_dd = static_cast(dst->src[1]->data); - float * dst_dd = static_cast(dst->data); - - dpct::queue_ptr stream = ctx.stream(); - - { - dpct::has_capability_or_fail(stream->get_device(), 
- {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); - } -} - static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, const int ky, const int kx_padded, queue_ptr stream) { @@ -1912,46 +1761,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - - GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); - GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); - GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); - GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - - switch (dst->src[0]->type) { - case GGML_TYPE_F16: - get_rows_sycl_float(ctx, dst); - break; - case GGML_TYPE_F32: - get_rows_sycl_float(ctx, dst); - break; - case GGML_TYPE_Q4_0: - get_rows_sycl(ctx, dst); - break; - case GGML_TYPE_Q4_1: - get_rows_sycl(ctx, dst); - break; - case GGML_TYPE_Q5_0: - get_rows_sycl(ctx, dst); - break; - case GGML_TYPE_Q5_1: - get_rows_sycl(ctx, dst); - break; - case GGML_TYPE_Q8_0: - get_rows_sycl(ctx, dst); - break; - default: - // TODO: k-quants - GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(dst->src[0]->type)); - GGML_ABORT("fatal error"); - break; - } -} inline void ggml_sycl_op_mul_mat_sycl( ggml_backend_sycl_context & ctx, From 8e86732cf272b343fde55819f0a3ecaa3df860d9 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sat, 1 Feb 2025 19:33:52 +0530 Subject: [PATCH 17/40] diagmask: move to a separate file --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/diagmask.cpp | 53 ++++++++++++++++++++++++++++++ 
ggml/src/ggml-sycl/diagmask.hpp | 8 +++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 55 -------------------------------- 4 files changed, 62 insertions(+), 55 deletions(-) create mode 100644 ggml/src/ggml-sycl/diagmask.cpp create mode 100644 ggml/src/ggml-sycl/diagmask.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 24cf492b3d93c..92519caf8ea8e 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -34,6 +34,7 @@ #include "argsort.hpp" #include "cpy.hpp" #include "getrows.hpp" +#include "diagmask.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/diagmask.cpp b/ggml/src/ggml-sycl/diagmask.cpp new file mode 100644 index 0000000000000..821c8c69958e8 --- /dev/null +++ b/ggml/src/ggml-sycl/diagmask.cpp @@ -0,0 +1,53 @@ +#include "diagmask.hpp" +#include + +static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, + const int n_past, const sycl::nd_item<3> & item_ct1) { + const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1); + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + + if (col >= ncols) { + return; + } + + const int i = row * ncols + col; + //dst[i] = col > (n_past + row % rows_per_channel) ? 
-INFINITY : x[i]; + //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; +} + +static void diag_mask_inf_f32_sycl(const float * x, float * dst, const int ncols_x, const int nrows_x, + const int rows_per_channel, const int n_past, queue_ptr stream) { + const sycl::range<3> block_dims(1, SYCL_DIAG_MASK_INF_BLOCK_SIZE, 1); + const int block_num_x = (ncols_x + SYCL_DIAG_MASK_INF_BLOCK_SIZE - 1) / SYCL_DIAG_MASK_INF_BLOCK_SIZE; + const sycl::range<3> block_nums(1, block_num_x, nrows_x); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + diag_mask_inf_f32(x, dst, ncols_x, rows_per_channel, n_past, item_ct1); + }); +} + +inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t ne01 = dst->src[0]->ne[1]; + const int nrows0 = ggml_nrows(dst->src[0]); + + const int n_past = ((int32_t *) dst->op_params)[0]; + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_diag_mask_inf(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} \ No newline at end of file diff --git 
a/ggml/src/ggml-sycl/diagmask.hpp b/ggml/src/ggml-sycl/diagmask.hpp new file mode 100644 index 0000000000000..37954aedca75a --- /dev/null +++ b/ggml/src/ggml-sycl/diagmask.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_DIAG_MASK +#define GGML_SYCL_DIAG_MASK + +#include "common.hpp" + +void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_DIAG_MASK \ No newline at end of file diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 93f93619954ef..2e8b4852a1bbf 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1463,24 +1463,6 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, } } - -static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past, - const sycl::nd_item<3> &item_ct1) { - const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (col >= ncols) { - return; - } - - const int i = row*ncols + col; - //dst[i] = col > (n_past + row % rows_per_channel) ? 
-INFINITY : x[i]; - //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU - dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; -} - static void scale_f32(const float * x, float * dst, const float scale, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + @@ -1666,21 +1648,6 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, }); } -static void diag_mask_inf_f32_sycl(const float *x, float *dst, - const int ncols_x, const int nrows_x, - const int rows_per_channel, const int n_past, - queue_ptr stream) { - const sycl::range<3> block_dims(1, SYCL_DIAG_MASK_INF_BLOCK_SIZE, 1); - const int block_num_x = (ncols_x + SYCL_DIAG_MASK_INF_BLOCK_SIZE - 1) / SYCL_DIAG_MASK_INF_BLOCK_SIZE; - const sycl::range<3> block_nums(1, block_num_x, nrows_x); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - diag_mask_inf_f32(x, dst, ncols_x, - rows_per_channel, n_past, - item_ct1); - }); -} - static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, const struct ggml_tensor *src, int64_t i3, int64_t i2, @@ -1962,24 +1929,6 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); } -inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - - const int64_t ne00 = dst->src[0]->ne[0]; - const int64_t ne01 = dst->src[0]->ne[1]; - const int nrows0 = ggml_nrows(dst->src[0]); - - const int n_past = ((int32_t *) dst->op_params)[0]; - dpct::queue_ptr main_stream = ctx.stream(); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); 
- - diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); -} - inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); @@ -2957,10 +2906,6 @@ static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_cpy(ctx, dst->src[0], dst); } -static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_diag_mask_inf(ctx, dst); -} - static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented ggml_sycl_op_rope(ctx, dst); From 7f2d24fdca3b4d0d4cab53ab991a75cdfbdddc26 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 11:21:36 +0530 Subject: [PATCH 18/40] rope: add try catch sycl exception and debug log --- ggml/src/ggml-sycl/ggml-sycl.cpp | 5 ----- ggml/src/ggml-sycl/rope.cpp | 13 ++++++++++++- ggml/src/ggml-sycl/rope.hpp | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 2e8b4852a1bbf..7059c46d6843d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2906,11 +2906,6 @@ static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_cpy(ctx, dst->src[0], dst); } -static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented - ggml_sycl_op_rope(ctx, dst); -} - static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_pool2d(ctx, dst); } diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 59fcb9f6cf369..2a8a7c778da08 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ 
b/ggml/src/ggml-sycl/rope.cpp @@ -192,7 +192,7 @@ static void rope_neox_sycl( } } -void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +static void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -272,4 +272,15 @@ void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ABORT("fatal error"); } } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT( + ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_rope(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp index dd15ac6d8967f..b2824c510ee37 100644 --- a/ggml/src/ggml-sycl/rope.hpp +++ b/ggml/src/ggml-sycl/rope.hpp @@ -15,6 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_ROPE_HPP From 927925ffe2987fc416f1eae0ef035133aa310a2f Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 11:34:37 +0530 Subject: [PATCH 19/40] scale: move to a separate file --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 49 -------------------------------- ggml/src/ggml-sycl/scale.cpp | 48 +++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/scale.hpp | 8 ++++++ 4 files changed, 57 insertions(+), 49 deletions(-) create mode 100644 ggml/src/ggml-sycl/scale.cpp create mode 100644 ggml/src/ggml-sycl/scale.hpp diff --git 
a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 92519caf8ea8e..6923214d58b01 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -35,6 +35,7 @@ #include "cpy.hpp" #include "getrows.hpp" #include "diagmask.hpp" +#include "scale.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 7059c46d6843d..bf1c14971810b 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1463,18 +1463,6 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, } } -static void scale_f32(const float * x, float * dst, const float scale, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - dst[i] = scale * x[i]; -} - static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + @@ -1612,18 +1600,6 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( } } -static void scale_f32_sycl(const float *x, float *dst, const float scale, - const int k, queue_ptr stream) { - const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - scale_f32(x, dst, scale, k, item_ct1); - }); -} - static void clamp_f32_sycl(const float *x, float *dst, const float min, const float max, const int k, queue_ptr stream) { @@ -1929,27 +1905,6 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); } -inline void 
ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - - float scale; - memcpy(&scale, dst->op_params, sizeof(float)); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - dpct::queue_ptr main_stream = ctx.stream(); - - scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream); - /* - DPCT1010:87: SYCL uses exceptions to report errors and does not use the - error codes. The call was replaced with 0. You need to rewrite this code. - */ - SYCL_CHECK(0); -} - inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); @@ -2893,10 +2848,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_scale(ctx, dst); -} - static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_clamp(ctx, dst); } diff --git a/ggml/src/ggml-sycl/scale.cpp b/ggml/src/ggml-sycl/scale.cpp new file mode 100644 index 0000000000000..c219526976524 --- /dev/null +++ b/ggml/src/ggml-sycl/scale.cpp @@ -0,0 +1,48 @@ +#include "scale.hpp" + +static void scale_f32(const float * x, float * dst, const float scale, const int k, const sycl::nd_item<3> & item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + dst[i] = scale * x[i]; +} + +static void scale_f32_sycl(const float * x, float * dst, const float scale, const int k, queue_ptr stream) { + const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE), + sycl::range<3>(1, 1, 
SYCL_SCALE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { scale_f32(x, dst, scale, k, item_ct1); }); +} + +inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + dpct::queue_ptr main_stream = ctx.stream(); + + scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream); + /* + DPCT1010:87: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. + */ + SYCL_CHECK(0); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_scale(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/scale.hpp b/ggml/src/ggml-sycl/scale.hpp new file mode 100644 index 0000000000000..d079a1e5b7005 --- /dev/null +++ b/ggml/src/ggml-sycl/scale.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_SCALE_HPP +#define GGML_SYCL_SCALE_HPP + +#include "common.hpp" + +void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_SCALE_HPP From 0c319bf721b2792f1108c667526b58db44d259ea Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 11:48:03 +0530 Subject: [PATCH 20/40] DUP: move to cpy.cpp, set debug logs and adjust include --- ggml/src/ggml-sycl/cpy.cpp | 11 +++++++++++ ggml/src/ggml-sycl/cpy.hpp | 2 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 5 ----- 3 files changed, 12 
insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp index 061fc06848b76..e6267dbf72680 100644 --- a/ggml/src/ggml-sycl/cpy.cpp +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -1,5 +1,7 @@ #include "cpy.hpp" +#include + static void cpy_1_f32_f32(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; float * dsti = (float *) cdsti; @@ -350,6 +352,8 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co char * src0_ddc = (char *) src0->data; char * src1_ddc = (char *) src1->data; + GGML_SYCL_DEBUG("%s: type combination supplied: %s to %s\n", __func__, ggml_type_name(src0->type), + ggml_type_name(src1->type)); if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, @@ -387,3 +391,10 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; std::exit(1); } + +void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + // TODO: why do we pass dst as src1 here? 
+ GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_cpy(ctx, dst->src[0], dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp index 1fbc7b75cc4f8..0a0f561d2309a 100644 --- a/ggml/src/ggml-sycl/cpy.hpp +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -2,10 +2,10 @@ #define GGML_SYCL_CPY_HPP #include "common.hpp" -#include typedef void (*cpy_kernel_t)(const char * cx, char * cdst); void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); +void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_CPY_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index bf1c14971810b..e6a4531f773e1 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2852,11 +2852,6 @@ static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) ggml_sycl_op_clamp(ctx, dst); } -static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - // TODO: why do we pass dst as src1 here? 
- ggml_sycl_cpy(ctx, dst->src[0], dst); -} - static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_pool2d(ctx, dst); } From ddc5e428f26a97f41f5b426d232be57d74277657 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 12:08:22 +0530 Subject: [PATCH 21/40] clamp: move to a separate file --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/clamp.cpp | 51 ++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/clamp.hpp | 8 +++++ ggml/src/ggml-sycl/ggml-sycl.cpp | 51 -------------------------------- 4 files changed, 60 insertions(+), 51 deletions(-) create mode 100644 ggml/src/ggml-sycl/clamp.cpp create mode 100644 ggml/src/ggml-sycl/clamp.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 6923214d58b01..efe88fb20f745 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -36,6 +36,7 @@ #include "getrows.hpp" #include "diagmask.hpp" #include "scale.hpp" +#include "clamp.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/clamp.cpp b/ggml/src/ggml-sycl/clamp.cpp new file mode 100644 index 0000000000000..f1c20d3ca5f13 --- /dev/null +++ b/ggml/src/ggml-sycl/clamp.cpp @@ -0,0 +1,51 @@ +#include "clamp.hpp" + +static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k, + const sycl::nd_item<3> & item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + + dst[i] = x[i] < min ? min : (x[i] > max ? 
max : x[i]); +} + +static void clamp_f32_sycl(const float * x, float * dst, const float min, const float max, const int k, + queue_ptr stream) { + const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { clamp_f32(x, dst, min, max, k, item_ct1); }); +} + +inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + const dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), main_stream); + /* + DPCT1010:88: SYCL uses exceptions to report errors and does not use the + error codes. The call was replaced with 0. You need to rewrite this code. 
+ SYCL_CHECK(0); + */ +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_clamp(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/clamp.hpp b/ggml/src/ggml-sycl/clamp.hpp new file mode 100644 index 0000000000000..fdfbff55b553c --- /dev/null +++ b/ggml/src/ggml-sycl/clamp.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_CLAMP_HPP +#define GGML_SYCL_CLAMP_HPP + +#include "common.hpp" + +void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_CLAMP_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index e6a4531f773e1..3773187fc8ed9 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1463,18 +1463,6 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, } } -static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - dst[i] = x[i] < min ? min : (x[i] > max ? 
max : x[i]); -} - template static void pool2d_nchw_kernel( const int ih, const int iw, const int oh, const int ow, @@ -1600,19 +1588,6 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( } } -static void clamp_f32_sycl(const float *x, float *dst, const float min, - const float max, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - clamp_f32(x, dst, min, max, k, item_ct1); - }); -} - static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, const int nrows, queue_ptr stream) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); @@ -1905,28 +1880,6 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); } -inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - - float min; - float max; - memcpy(&min, dst->op_params, sizeof(float)); - memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); - const dpct::queue_ptr main_stream = ctx.stream(); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), main_stream); - /* - DPCT1010:88: SYCL uses exceptions to report errors and does not use the - error codes. The call was replaced with 0. You need to rewrite this code. 
- */ - SYCL_CHECK(0); -} - static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { static bool peer_access_enabled = false; @@ -2848,10 +2801,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_clamp(ctx, dst); -} - static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_pool2d(ctx, dst); } From ba79258a2bfe8186d2693c92149385e98c0f3d75 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 12:16:36 +0530 Subject: [PATCH 22/40] Add spaces to end of files --- ggml/src/ggml-sycl/diagmask.cpp | 2 +- ggml/src/ggml-sycl/diagmask.hpp | 2 +- ggml/src/ggml-sycl/getrows.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-sycl/diagmask.cpp b/ggml/src/ggml-sycl/diagmask.cpp index 821c8c69958e8..72184d845929d 100644 --- a/ggml/src/ggml-sycl/diagmask.cpp +++ b/ggml/src/ggml-sycl/diagmask.cpp @@ -50,4 +50,4 @@ void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_diag_mask_inf(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); -} \ No newline at end of file +} diff --git a/ggml/src/ggml-sycl/diagmask.hpp b/ggml/src/ggml-sycl/diagmask.hpp index 37954aedca75a..8a42a4e0899ba 100644 --- a/ggml/src/ggml-sycl/diagmask.hpp +++ b/ggml/src/ggml-sycl/diagmask.hpp @@ -5,4 +5,4 @@ void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -#endif // GGML_SYCL_DIAG_MASK \ No newline at end of file +#endif // GGML_SYCL_DIAG_MASK diff --git a/ggml/src/ggml-sycl/getrows.hpp b/ggml/src/ggml-sycl/getrows.hpp index 285c56af32a0f..7060b04d46923 100644 --- a/ggml/src/ggml-sycl/getrows.hpp +++ b/ggml/src/ggml-sycl/getrows.hpp @@ -5,4 +5,4 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -#endif // GGML_SYCL_GETROWS_HPP \ No newline at end of file 
+#endif // GGML_SYCL_GETROWS_HPP From 4db56d6ed220f576c7bffb20eed9c29b0aff356c Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 17:37:48 +0530 Subject: [PATCH 23/40] im2col: add try catch block and move wrapper function from ggml-sycl.cpp --- ggml/src/ggml-sycl/ggml-sycl.cpp | 4 ---- ggml/src/ggml-sycl/im2col.cpp | 11 ++++++++++- ggml/src/ggml-sycl/im2col.hpp | 2 +- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 3773187fc8ed9..0992a1b586010 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2805,10 +2805,6 @@ static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) ggml_sycl_op_pool2d(ctx, dst); } -static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_im2col(ctx, dst); -} - static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); ggml_sycl_op_sum(ctx, dst); diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index d6b998b7e98a2..8b7ed4ca05210 100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -82,7 +82,7 @@ static void im2col_sycl( } } -void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +static void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); @@ -122,4 +122,13 @@ void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { float * dst_dd = static_cast(dst->data); im2col_sycl(src1_dd, dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} 
+ +void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_im2col(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/im2col.hpp b/ggml/src/ggml-sycl/im2col.hpp index 4474c7b7b9157..9d51738f2c72a 100644 --- a/ggml/src/ggml-sycl/im2col.hpp +++ b/ggml/src/ggml-sycl/im2col.hpp @@ -15,6 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_IM2COL_HPP From eb466d733a26996f25520a916b9773efb63a99f4 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 17:49:19 +0530 Subject: [PATCH 24/40] pool2d: move to a separate file --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 104 ---------------------------- ggml/src/ggml-sycl/pool2d.cpp | 114 +++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/pool2d.hpp | 8 +++ 4 files changed, 123 insertions(+), 104 deletions(-) create mode 100644 ggml/src/ggml-sycl/pool2d.cpp create mode 100644 ggml/src/ggml-sycl/pool2d.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index efe88fb20f745..0b1019386ac44 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -37,6 +37,7 @@ #include "diagmask.hpp" #include "scale.hpp" #include "clamp.hpp" +#include "pool2d.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 0992a1b586010..346a322604ee9 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1463,67 +1463,7 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, } } -template -static void pool2d_nchw_kernel( - const int ih, const int iw, const int oh, const int ow, - const int kh, const int kw, const int sh, const int sw, - const int ph, 
const int pw, const int parallel_elements, - const Ti* src, To* dst, const enum ggml_op_pool op, - const sycl::nd_item<3> &item_ct1) { - int idx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (idx >= parallel_elements) { - return; - } - - const int I_HW = ih * iw; - const int O_HW = oh * ow; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / ow; - const int cur_ow = idx % O_HW % ow; - const Ti* i_ptr = src + nc * I_HW; - To* o_ptr = dst + nc * O_HW; - const int start_h = cur_oh * sh - ph; - const int bh = sycl::max(0, start_h); - const int eh = sycl::min(ih, start_h + kh); - const int start_w = cur_ow * sw - pw; - const int bw = sycl::max(0, start_w); - const int ew = sycl::min(iw, start_w + kw); - - To res = 0; - - switch (op) { - case GGML_OP_POOL_AVG: res = 0; break; - case GGML_OP_POOL_MAX: res = -FLT_MAX; break; - default: - res = (To) sycl::nan(uint32_t(0)); - break; - } - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { -#if DPCT_COMPATIBILITY_TEMP >= 350 - /* - DPCT1098:106: The '*' expression is used instead of the __ldg - call. These two expressions do not provide the exact same - functionality. Check the generated code for potential precision - and/or performance issues. 
- */ - Ti cur = *(i_ptr + i * iw + j); -#else - Ti cur = i_ptr[i * iw + j]; -#endif - switch (op) { - case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break; - case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break; - default: - res = (To) sycl::nan(uint32_t(0)); - break; - } - } - } - o_ptr[cur_oh * ow + cur_ow] = res; -} static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, const int ky, const int kx_padded, @@ -1812,46 +1752,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - - const int64_t IH = dst->src[0]->ne[1]; - const int64_t IW = dst->src[0]->ne[0]; - - const int64_t N = dst->ne[3]; - const int64_t OC = dst->ne[2]; - const int64_t OH = dst->ne[1]; - const int64_t OW = dst->ne[0]; - - const int parallel_elements = N * OC * OH * OW; - const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; - dpct::queue_ptr main_stream = ctx.stream(); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - sycl::range<3> block_nums(1, 1, num_blocks); - main_stream->parallel_for( - sycl::nd_range<3>(block_nums * - sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, - parallel_elements, src0_dd, dst_dd, op, - item_ct1); - }); -} - inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { 
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -2801,10 +2701,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_pool2d(ctx, dst); -} - static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); ggml_sycl_op_sum(ctx, dst); diff --git a/ggml/src/ggml-sycl/pool2d.cpp b/ggml/src/ggml-sycl/pool2d.cpp new file mode 100644 index 0000000000000..dd11ee6b5b61b --- /dev/null +++ b/ggml/src/ggml-sycl/pool2d.cpp @@ -0,0 +1,114 @@ +#include "pool2d.hpp" +#include + +template +static void pool2d_nchw_kernel(const int ih, const int iw, const int oh, const int ow, const int kh, const int kw, + const int sh, const int sw, const int ph, const int pw, const int parallel_elements, + const Ti * src, To * dst, const enum ggml_op_pool op, + const sycl::nd_item<3> & item_ct1) { + int idx = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (idx >= parallel_elements) { + return; + } + + const int I_HW = ih * iw; + const int O_HW = oh * ow; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / ow; + const int cur_ow = idx % O_HW % ow; + const Ti * i_ptr = src + nc * I_HW; + To * o_ptr = dst + nc * O_HW; + const int start_h = cur_oh * sh - ph; + const int bh = sycl::max(0, start_h); + const int eh = sycl::min(ih, start_h + kh); + const int start_w = cur_ow * sw - pw; + const int bw = sycl::max(0, start_w); + const int ew = sycl::min(iw, start_w + kw); + + To res = 0; + + switch (op) { + case GGML_OP_POOL_AVG: + res = 0; + break; + case GGML_OP_POOL_MAX: + res = -FLT_MAX; + break; + default: + res = (To) sycl::nan(uint32_t(0)); + break; + } + + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { +#if DPCT_COMPATIBILITY_TEMP >= 350 + /* + DPCT1098:106: The '*' expression is used instead of the __ldg + call. 
These two expressions do not provide the exact same + functionality. Check the generated code for potential precision + and/or performance issues. + */ + Ti cur = *(i_ptr + i * iw + j); +#else + Ti cur = i_ptr[i * iw + j]; +#endif + switch (op) { + case GGML_OP_POOL_AVG: + res += (cur / (kh * kw)); + break; + case GGML_OP_POOL_MAX: + res = sycl::max(res, (To) cur); + break; + default: + res = (To) sycl::nan(uint32_t(0)); + break; + } + } + } + o_ptr[cur_oh * ow + cur_ow] = res; +} + +static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + + const int32_t * opts = (const int32_t *) dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + const int64_t IH = dst->src[0]->ne[1]; + const int64_t IW = dst->src[0]->ne[0]; + + const int64_t N = dst->ne[3]; + const int64_t OC = dst->ne[2]; + const int64_t OH = dst->ne[1]; + const int64_t OW = dst->ne[0]; + + const int parallel_elements = N * OC * OH * OW; + const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + sycl::range<3> block_nums(1, 1, num_blocks); + main_stream->parallel_for(sycl::nd_range<3>(block_nums * sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, + dst_dd, op, item_ct1); + }); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception 
caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_pool2d(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/pool2d.hpp b/ggml/src/ggml-sycl/pool2d.hpp new file mode 100644 index 0000000000000..6b2ce8043c951 --- /dev/null +++ b/ggml/src/ggml-sycl/pool2d.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_POOL2D_HPP +#define GGML_SYCL_POOL2D_HPP + +#include "common.hpp" + +void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_POOL2D_HPP From 5c05a3eedc33d28aefef8424137f3351fc53e318 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 18:16:41 +0530 Subject: [PATCH 25/40] Move sum and sum rows to a separate file --- ggml/src/ggml-sycl/ggml-sycl.cpp | 49 ---------------------- ggml/src/ggml-sycl/sum.cpp | 72 ++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/sum.hpp | 9 ++++ 3 files changed, 81 insertions(+), 49 deletions(-) create mode 100644 ggml/src/ggml-sycl/sum.cpp create mode 100644 ggml/src/ggml-sycl/sum.hpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 346a322604ee9..451bb2bae8a92 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1528,17 +1528,6 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( } } -static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, - const int nrows, queue_ptr stream) { - const sycl::range<3> block_dims(1, 1, WARP_SIZE); - const sycl::range<3> block_nums(1, nrows, 1); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { - k_sum_rows_f32(x, dst, ncols, item_ct1); - }); -} - static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, const struct ggml_tensor *src, int64_t i3, int64_t i2, @@ 
-1752,34 +1741,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - - const int64_t ne = ggml_nelements(dst->src[0]); - dpct::queue_ptr main_stream = ctx.stream(); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream); -} - -inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - - const int64_t ncols = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); - dpct::queue_ptr main_stream = ctx.stream(); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); -} - static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { static bool peer_access_enabled = false; @@ -2701,16 +2662,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_sum(ctx, dst); -} - -static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_sum_rows(ctx, dst); -} - void ggml_sycl_set_main_device(const int main_device) try { if (dpct::get_current_device_id() == static_cast (main_device)) { return; diff --git a/ggml/src/ggml-sycl/sum.cpp b/ggml/src/ggml-sycl/sum.cpp new file mode 100644 index 0000000000000..be94b17845fa9 --- /dev/null +++ b/ggml/src/ggml-sycl/sum.cpp @@ -0,0 +1,72 @@ 
+#include "sum.hpp" + +static void k_sum_rows_f32(const float * x, float * dst, const int ncols, const sycl::nd_item<3> & item_ct1) { + const int row = item_ct1.get_group(1); + const int col = item_ct1.get_local_id(2); + + float sum = 0.0f; + for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum, item_ct1); + + if (col == 0) { + dst[row] = sum; + } +} + +static void sum_rows_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, queue_ptr stream) { + const sycl::range<3> block_dims(1, 1, WARP_SIZE); + const sycl::range<3> block_nums(1, nrows, 1); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(WARP_SIZE)]] { k_sum_rows_f32(x, dst, ncols, item_ct1); }); +} + +inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + + const int64_t ne = ggml_nelements(dst->src[0]); + dpct::queue_ptr main_stream = ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + + const int64_t ncols = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); + dpct::queue_ptr main_stream = 
ctx.stream(); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_sum(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(dst->src[0])); + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_sum_rows(ctx, dst); + GML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/sum.hpp b/ggml/src/ggml-sycl/sum.hpp new file mode 100644 index 0000000000000..d1b8e5a7c468e --- /dev/null +++ b/ggml/src/ggml-sycl/sum.hpp @@ -0,0 +1,9 @@ +#ifndef GGML_SYCL_SUM_HPP +#define GGML_SYCL_SUM_HPP + +#include "common.hpp" + +void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_SUM_HPP From d31c62d758c2ca42c957a5c5ff74817a271a2eda Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 18:20:44 +0530 Subject: [PATCH 26/40] norm: add try catch sycl exception --- ggml/src/ggml-sycl/ggml-sycl.cpp | 19 ------------------ ggml/src/ggml-sycl/norm.cpp | 33 +++++++++++++++++++++++++++++--- ggml/src/ggml-sycl/norm.hpp | 8 +++----- 3 files changed, 33 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 451bb2bae8a92..9a4c7279c55b4 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2106,25 +2106,6 @@ catch (sycl::exception const &exc) { std::exit(1); } - -static 
void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_norm(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_rms_norm(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_group_norm(ctx, dst); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) try { diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 8b096b998df3f..3c42f351c6ea8 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -311,7 +311,7 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, } } -void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +static void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -328,9 +328,12 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { dpct::queue_ptr main_stream = ctx.stream(); norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { +static void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -346,9 +349,12 @@ void 
ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, dst->src[0]->ne[0] * dst->src[0]->ne[1] * dst->src[0]->ne[2], main_stream, ctx.device); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +static void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -364,4 +370,25 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { dpct::queue_ptr main_stream = ctx.stream(); rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } + +void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_norm(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_rms_norm(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_group_norm(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} \ No newline at end of file diff --git a/ggml/src/ggml-sycl/norm.hpp b/ggml/src/ggml-sycl/norm.hpp index e733de5c23c81..a227a9bc0f3f7 100644 --- a/ggml/src/ggml-sycl/norm.hpp +++ b/ggml/src/ggml-sycl/norm.hpp @@ -15,10 +15,8 @@ #include 
"common.hpp" -void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor * dst); - -void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, ggml_tensor * dst); - -void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor * dst); +void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_NORM_HPP From 1ccfaaedbb262042e04524e7fdf7f90ddf2403bc Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 18:24:45 +0530 Subject: [PATCH 27/40] Add sum to backend hpp --- ggml/src/ggml-sycl/backend.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 0b1019386ac44..965bde36ebf8f 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -38,6 +38,7 @@ #include "scale.hpp" #include "clamp.hpp" #include "pool2d.hpp" +#include "sum.hpp" #include "gla.hpp" #endif // GGML_SYCL_BACKEND_HPP From bba4b66a81bbff9d7649c732e3ab2e294113f531 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 18:34:04 +0530 Subject: [PATCH 28/40] concat: Handle SYCL exceptions --- ggml/src/ggml-sycl/concat.cpp | 11 ++++++++++- ggml/src/ggml-sycl/concat.hpp | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index d41cfd3a6ec88..6485df22d3a86 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -158,7 +158,7 @@ static void concat_f32_sycl_non_cont( }); } -void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +static void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; queue_ptr stream = ctx.stream(); @@ -194,4 +194,13 @@ void 
ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } + +void ggml_sycl_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_concat(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} \ No newline at end of file diff --git a/ggml/src/ggml-sycl/concat.hpp b/ggml/src/ggml-sycl/concat.hpp index e5cb7314c9f33..8a1c5596e3654 100644 --- a/ggml/src/ggml-sycl/concat.hpp +++ b/ggml/src/ggml-sycl/concat.hpp @@ -15,6 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst); +void ggml_sycl_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_CONCAT_HPP From 6dbb7ac827e844c9353b59b1f916dcebdc1cf23a Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 18:40:01 +0530 Subject: [PATCH 29/40] softmax: handle SYCL exceptions and add debug logs --- ggml/src/ggml-sycl/ggml-sycl.cpp | 4 ++-- ggml/src/ggml-sycl/softmax.cpp | 17 +++++++++++++++-- ggml/src/ggml-sycl/softmax.hpp | 2 +- ggml/src/ggml-sycl/sum.cpp | 6 +++--- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 9a4c7279c55b4..6303444aa1875 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2752,7 +2752,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_group_norm(ctx, dst); break; case GGML_OP_CONCAT: - ggml_sycl_op_concat(ctx, dst); + ggml_sycl_concat(ctx, dst); break; case GGML_OP_UPSCALE: ggml_sycl_upscale(ctx, dst); @@ 
-2817,7 +2817,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_diag_mask_inf(ctx, dst); break; case GGML_OP_SOFT_MAX: - ggml_sycl_op_soft_max(ctx, dst); + ggml_sycl_softmax(ctx, dst); break; case GGML_OP_ROPE: ggml_sycl_rope(ctx, dst); diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index 563e0655f5527..2412076c423b9 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -224,7 +224,7 @@ static void soft_max_f32_sycl(const float * x, const T * mask, } } -void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +static void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -249,13 +249,26 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) { const sycl::half * src1_dd = static_cast(dst->src[1]->data); + GGML_SYCL_DEBUG("%s: Mask precision: F16\n", __func__); soft_max_f32_sycl(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) { const float * src1_dd = static_cast(dst->src[1]->data); + GGML_SYCL_DEBUG("%s: Mask precision: F32\n", __func__); soft_max_f32_sycl(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); } else { /* mask unavailable */ - soft_max_f32_sycl(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device); + GGML_SYCL_DEBUG("%s: No mask supplied\n", __func__); + soft_max_f32_sycl(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, + ctx.device); } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + 
+void ggml_sycl_softmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_soft_max(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/softmax.hpp b/ggml/src/ggml-sycl/softmax.hpp index 2cf8582ec92e9..0c12a530dfe1a 100644 --- a/ggml/src/ggml-sycl/softmax.hpp +++ b/ggml/src/ggml-sycl/softmax.hpp @@ -15,6 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, ggml_tensor *dst); +void ggml_sycl_softmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_SOFTMAX_HPP diff --git a/ggml/src/ggml-sycl/sum.cpp b/ggml/src/ggml-sycl/sum.cpp index be94b17845fa9..67cfc4b1551e6 100644 --- a/ggml/src/ggml-sycl/sum.cpp +++ b/ggml/src/ggml-sycl/sum.cpp @@ -27,7 +27,7 @@ static void sum_rows_f32_sycl(const float * x, float * dst, const int ncols, con inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const int64_t ne = ggml_nelements(dst->src[0]); dpct::queue_ptr main_stream = ctx.stream(); @@ -43,7 +43,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const int64_t ncols = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); @@ -68,5 +68,5 @@ void 
ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(dst->src[0])); GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_sum_rows(ctx, dst); - GML_SYCL_DEBUG("call %s done\n", __func__); + GGML_SYCL_DEBUG("call %s done\n", __func__); } From a6a239cf39c804e39d00b89d929ac92e8f24e4c4 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 19:03:29 +0530 Subject: [PATCH 30/40] norm: add a space at the end of file --- ggml/src/ggml-sycl/norm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 3c42f351c6ea8..0a7411a31e288 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -391,4 +391,4 @@ void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_group_norm(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); -} \ No newline at end of file +} From 6eb30d9403b3988a29f5f732392af47e474058c6 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Sun, 2 Feb 2025 19:09:23 +0530 Subject: [PATCH 31/40] Adjust EOF spaces and usused variable --- ggml/src/ggml-sycl/concat.cpp | 2 +- ggml/src/ggml-sycl/conv.cpp | 1 - ggml/src/ggml-sycl/dmmv.cpp | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index 6485df22d3a86..e6fdd79b1e0a2 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -203,4 +203,4 @@ void ggml_sycl_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_concat(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); -} \ No newline at end of file +} diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index ddba601e10fcc..bce7fdd791e2f 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -97,4 +97,3 @@ void 
ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor src1->ne[0], dst->ne[0], src0_d, src1_d, dst_d, stream); } - diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 0d097357ce79b..224854307337e 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -973,6 +973,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec( } #else const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion + GGML_UNUSED(ctx); #endif // GGML_SYCL_F16 switch (src0->type) { From 539b0c662ee9eaa40e56d24bddc3a23ef6e1759b Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 3 Feb 2025 10:01:07 +0530 Subject: [PATCH 32/40] ggml-sycl: sort includes --- ggml/include/ggml-sycl.h | 3 --- ggml/src/ggml-sycl/common.hpp | 7 ++++++- ggml/src/ggml-sycl/ggml-sycl.cpp | 8 +------- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index 5ce349a880edc..6b0bebf9c70c9 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -9,9 +9,6 @@ #include "ggml.h" #include "ggml-backend.h" -#define GGML_SYCL_NAME "SYCL" -#define GGML_SYCL_MAX_DEVICES 48 - #ifdef __cplusplus extern "C" { #endif diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 38f5cda7297f3..0eb291ecc3b35 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -17,7 +17,6 @@ #include #include "dpct/helper.hpp" -#include "ggml-sycl.h" #include "presets.hpp" #if GGML_SYCL_DNNL #include "dnnl.hpp" @@ -31,6 +30,9 @@ #pragma clang diagnostic ignored "-Wnested-anon-types" #include "ggml-common.h" #pragma clang diagnostic pop +#include +#include + #include "ggml-backend-impl.h" #include "ggml-impl.h" #include "ggml.h" @@ -88,6 +90,9 @@ extern int g_ggml_sycl_debug; #define GGML_SYCL_MMV_Y 1 #endif +#define GGML_SYCL_NAME "SYCL" +#define GGML_SYCL_MAX_DEVICES 48 + typedef sycl::queue *queue_ptr; enum ggml_sycl_backend_gpu_mode { 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 6303444aa1875..b0d0818622915 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -29,15 +29,9 @@ #include #include -#include -#include - #include "ggml-sycl.h" -#include "ggml-impl.h" -#include "ggml-backend-impl.h" - +#include "common.hpp" #include "ggml-sycl/backend.hpp" -#include "ggml-sycl/presets.hpp" #include "ggml-sycl/gemm.hpp" static bool g_sycl_loaded = false; From 18d706ab0e52898d16a5d190a2b9d61b30ebbbb8 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 3 Feb 2025 10:38:56 +0530 Subject: [PATCH 33/40] gemm.hpp: remove unused include --- ggml/src/ggml-sycl/gemm.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index 3f0f34ad603f5..a43099680a4be 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -16,8 +16,6 @@ #include #include -#include "ggml-sycl.h" - #if GGML_SYCL_DNNL #include "dnnl.hpp" From 0ae9a07cf8429529fe1e3613692666ed4540c4be Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 3 Feb 2025 11:15:43 +0530 Subject: [PATCH 34/40] Add debug logs to ggml_sycl_mul_mat --- ggml/src/ggml-sycl/ggml-sycl.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index b0d0818622915..fd3583539e52d 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2380,25 +2380,41 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor if (src0->ne[3] == 1 && src1->ne[3] == 1) { // KQ single-batch // mmv p021 was specific for these dimensions + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_vec_p021\n", __func__); ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_vec_p021 done\n", __func__); } else { // The kernel from the if path is faster for
that specific case, but does not support all mul mats. + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_batched_sycl\n", __func__); ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_batched_sycl done\n", __func__); } } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_vec_nc\n", __func__); ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst); + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_vec_nc done\n", __func__); } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_batched_sycl\n", __func__); ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_batched_sycl done\n", __func__); } else if (use_dequantize_mul_mat_vec) { + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_dequantize_mul_mat_vec\n", __func__); ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_dequantize_mul_mat_vec done\n", __func__); } else if (use_mul_mat_vec_q) { + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_vec_q\n", __func__); ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_vec_q done\n", __func__); } else if (use_mul_mat_q) { + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_q\n", __func__); ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true); + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_q done\n", __func__); } else { + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_sycl\n", __func__); ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_sycl done\n", __func__); } } From 
7369e54b33c47779e8cdef423829b0afd3d8081f Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 3 Feb 2025 11:53:22 +0530 Subject: [PATCH 35/40] Add back ggml_sycl_set_device to kernels --- ggml/src/ggml-sycl/argmax.cpp | 1 + ggml/src/ggml-sycl/argsort.cpp | 1 + ggml/src/ggml-sycl/binbcast.cpp | 5 +++++ ggml/src/ggml-sycl/clamp.cpp | 1 + ggml/src/ggml-sycl/concat.cpp | 1 + ggml/src/ggml-sycl/conv.cpp | 1 + ggml/src/ggml-sycl/diagmask.cpp | 1 + ggml/src/ggml-sycl/element_wise.cpp | 20 ++++++++++++++++++++ ggml/src/ggml-sycl/getrows.cpp | 2 ++ ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- ggml/src/ggml-sycl/gla.cpp | 1 + ggml/src/ggml-sycl/im2col.cpp | 1 + ggml/src/ggml-sycl/norm.cpp | 3 +++ ggml/src/ggml-sycl/outprod.cpp | 2 ++ ggml/src/ggml-sycl/pool2d.cpp | 1 + ggml/src/ggml-sycl/rope.cpp | 1 + ggml/src/ggml-sycl/scale.cpp | 1 + ggml/src/ggml-sycl/softmax.cpp | 2 +- ggml/src/ggml-sycl/sum.cpp | 2 ++ ggml/src/ggml-sycl/wkv6.cpp | 1 + 20 files changed, 48 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/argmax.cpp b/ggml/src/ggml-sycl/argmax.cpp index 76bc6f28ca7b1..502f82840b128 100644 --- a/ggml/src/ggml-sycl/argmax.cpp +++ b/ggml/src/ggml-sycl/argmax.cpp @@ -58,6 +58,7 @@ static void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * d const int64_t nrows = ggml_nrows(dst->src[0]); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); int32_t * dst_dd = static_cast(dst->data); argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); diff --git a/ggml/src/ggml-sycl/argsort.cpp b/ggml/src/ggml-sycl/argsort.cpp index 9c88d7323cec0..4557599db49b4 100644 --- a/ggml/src/ggml-sycl/argsort.cpp +++ b/ggml/src/ggml-sycl/argsort.cpp @@ -111,6 +111,7 @@ inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; dpct::queue_ptr main_stream = ctx.stream(); + 
SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); int32_t * dst_dd = static_cast(dst->data); diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index b94b82e799b81..0d30150246240 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -237,6 +237,7 @@ inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); @@ -250,6 +251,7 @@ inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); @@ -263,6 +265,7 @@ inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); @@ -276,6 +279,7 @@ inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); ggml_sycl_op_bin_bcast>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd, main_stream); @@ -288,6 +292,7 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * d 
const void * src0_d = static_cast(dst->src[0]->data); void * dst_d = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); ggml_sycl_op_bin_bcast>(dst, dst->src[0], dst, nullptr, src0_d, dst_d, main_stream); } catch (const sycl::exception & exc) { diff --git a/ggml/src/ggml-sycl/clamp.cpp b/ggml/src/ggml-sycl/clamp.cpp index f1c20d3ca5f13..8ffee729cdfae 100644 --- a/ggml/src/ggml-sycl/clamp.cpp +++ b/ggml/src/ggml-sycl/clamp.cpp @@ -30,6 +30,7 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * ds memcpy(&min, dst->op_params, sizeof(float)); memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index e6fdd79b1e0a2..fa44cd7b6de84 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -162,6 +162,7 @@ static void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor * d const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const int32_t dim = ((int32_t *)dst->op_params)[0]; diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index bce7fdd791e2f..ef310859122f2 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -79,6 +79,7 @@ void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor float * dst_d = (float *)dst->data; dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-sycl/diagmask.cpp b/ggml/src/ggml-sycl/diagmask.cpp index 72184d845929d..8d61463f2d3bf 
100644 --- a/ggml/src/ggml-sycl/diagmask.cpp +++ b/ggml/src/ggml-sycl/diagmask.cpp @@ -37,6 +37,7 @@ inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_ten const int n_past = ((int32_t *) dst->op_params)[0]; dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 70a6de470308c..12890cd83a3d8 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -514,6 +514,7 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -526,6 +527,7 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -538,6 +540,7 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -551,6 +554,7 @@ inline void 
ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor *dst) GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); @@ -562,6 +566,7 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); @@ -573,6 +578,7 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); @@ -585,6 +591,7 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -597,6 +604,7 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) 
GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); @@ -608,6 +616,7 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor *dst) GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -620,6 +629,7 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *d GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -632,6 +642,7 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); @@ -643,6 +654,7 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->type == GGML_TYPE_F32); 
GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -655,6 +667,7 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -669,6 +682,7 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor *dst) const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); step_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } @@ -681,6 +695,7 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor *dst) const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } @@ -697,6 +712,7 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream); } @@ -709,6 +725,7 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = 
static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); } @@ -727,6 +744,7 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); upscale_f32_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, @@ -743,6 +761,7 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); pad_f32_sycl(src0_dd, dst_dd, dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], @@ -760,6 +779,7 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); const float * src1_dd = static_cast(dst->src[1]->data); float * dst_dd = static_cast(dst->data); diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index 501c1f7a6a646..1833c80a82bf3 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -84,6 +84,7 @@ static void get_rows_sycl(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { float * dst_dd = static_cast(dst->data); dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> 
item_ct1) { k_get_rows(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, @@ -113,6 +114,7 @@ template static void get_rows_sycl_float(ggml_backend_sycl_con float * dst_dd = static_cast(dst->data); dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); { dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index fd3583539e52d..ea8a98e879608 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3081,7 +3081,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) { } int ggml_backend_sycl_get_device_count() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n"); + // GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count:\n"); return ggml_sycl_info().device_count; } diff --git a/ggml/src/ggml-sycl/gla.cpp b/ggml/src/ggml-sycl/gla.cpp index eedb47486430a..630db5ba6868c 100644 --- a/ggml/src/ggml-sycl/gla.cpp +++ b/ggml/src/ggml-sycl/gla.cpp @@ -88,6 +88,7 @@ void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor const int64_t H = dst->src[0]->ne[1]; dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32); GGML_ASSERT(C % H == 0); GGML_ASSERT(C / H == 64 || C / H == 128); diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index 8b7ed4ca05210..834aec5a85056 100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -112,6 +112,7 @@ static void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * d const int64_t batch = dst->src[1]->ne[3]; const size_t batch_offset = dst->src[1]->nb[3] / 4; // nb is byte offset, src is type float32 dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); if (dst->type == GGML_TYPE_F16) { const float * src1_dd = 
static_cast(dst->src[1]->data); diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 0a7411a31e288..54e9ca5583331 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -326,6 +326,7 @@ static void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); } catch (const sycl::exception & exc) { @@ -348,6 +349,7 @@ static void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, dst->src[0]->ne[0] * dst->src[0]->ne[1] * dst->src[0]->ne[2], main_stream, ctx.device); } catch (const sycl::exception & exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; @@ -368,6 +370,7 @@ static void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); } catch (const sycl::exception & exc) { diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index 8e8347ff4f95e..27c3adca4e975 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -17,6 +17,8 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Get SYCL queue dpct::queue_ptr stream = ctx.stream(); + // set device + 
SYCL_CHECK(ggml_sycl_set_device(ctx.device)); // Dimension checks GGML_ASSERT(ne01 == ne11); // Inner dimensions must match diff --git a/ggml/src/ggml-sycl/pool2d.cpp b/ggml/src/ggml-sycl/pool2d.cpp index dd11ee6b5b61b..ee67307ca4d33 100644 --- a/ggml/src/ggml-sycl/pool2d.cpp +++ b/ggml/src/ggml-sycl/pool2d.cpp @@ -93,6 +93,7 @@ static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * d const int parallel_elements = N * OC * OH * OW; const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); sycl::range<3> block_nums(1, 1, num_blocks); diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 2a8a7c778da08..b7f03222f9841 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -236,6 +236,7 @@ static void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst rope_corr_dims corr_dims; ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); // compute if (is_neox) { diff --git a/ggml/src/ggml-sycl/scale.cpp b/ggml/src/ggml-sycl/scale.cpp index c219526976524..f37f852cf0607 100644 --- a/ggml/src/ggml-sycl/scale.cpp +++ b/ggml/src/ggml-sycl/scale.cpp @@ -29,6 +29,7 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * ds float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream); /* diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index 2412076c423b9..018fed5a956ed 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ 
b/ggml/src/ggml-sycl/softmax.cpp @@ -244,7 +244,7 @@ static void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); - ggml_sycl_set_device(ctx.device); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); dpct::queue_ptr main_stream = ctx.stream(); if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) { diff --git a/ggml/src/ggml-sycl/sum.cpp b/ggml/src/ggml-sycl/sum.cpp index 67cfc4b1551e6..66d3c8f6d6f8b 100644 --- a/ggml/src/ggml-sycl/sum.cpp +++ b/ggml/src/ggml-sycl/sum.cpp @@ -31,6 +31,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) const int64_t ne = ggml_nelements(dst->src[0]); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -48,6 +49,7 @@ inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * const int64_t ncols = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp index e3ea568c5f5e7..9c20135fd4dc9 100644 --- a/ggml/src/ggml-sycl/wkv6.cpp +++ b/ggml/src/ggml-sycl/wkv6.cpp @@ -115,6 +115,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64 dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); // Calculate execution configuration const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td From e5926374a5aa186965744a52cc58004eba4db86f Mon Sep 17 00:00:00 2001 From: Akarshan Biswas 
Date: Mon, 3 Feb 2025 18:44:49 +0530 Subject: [PATCH 36/40] Add remaining SYCL exception handler to kernel and refactor --- ggml/src/ggml-sycl/argmax.cpp | 2 +- ggml/src/ggml-sycl/argsort.cpp | 1 + ggml/src/ggml-sycl/binbcast.cpp | 9 ++ ggml/src/ggml-sycl/clamp.cpp | 2 +- ggml/src/ggml-sycl/common.cpp | 10 ++ ggml/src/ggml-sycl/common.hpp | 2 + ggml/src/ggml-sycl/concat.cpp | 56 +++++------ ggml/src/ggml-sycl/conv.cpp | 13 ++- ggml/src/ggml-sycl/conv.hpp | 2 +- ggml/src/ggml-sycl/cpy.cpp | 3 +- ggml/src/ggml-sycl/diagmask.cpp | 2 +- ggml/src/ggml-sycl/element_wise.cpp | 142 ++++++++++++++++++++-------- ggml/src/ggml-sycl/getrows.cpp | 15 ++- ggml/src/ggml-sycl/getrows.hpp | 2 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 17 +--- ggml/src/ggml-sycl/gla.cpp | 11 ++- ggml/src/ggml-sycl/gla.hpp | 2 +- ggml/src/ggml-sycl/im2col.cpp | 2 +- ggml/src/ggml-sycl/norm.cpp | 6 +- ggml/src/ggml-sycl/outprod.cpp | 2 + ggml/src/ggml-sycl/pool2d.cpp | 2 +- ggml/src/ggml-sycl/rope.cpp | 4 +- ggml/src/ggml-sycl/scale.cpp | 2 +- ggml/src/ggml-sycl/softmax.cpp | 1 + ggml/src/ggml-sycl/sum.cpp | 4 +- ggml/src/ggml-sycl/tsembd.cpp | 8 +- ggml/src/ggml-sycl/wkv6.cpp | 7 +- 27 files changed, 221 insertions(+), 108 deletions(-) diff --git a/ggml/src/ggml-sycl/argmax.cpp b/ggml/src/ggml-sycl/argmax.cpp index 502f82840b128..b7d0471f8b06c 100644 --- a/ggml/src/ggml-sycl/argmax.cpp +++ b/ggml/src/ggml-sycl/argmax.cpp @@ -52,7 +52,7 @@ static void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * d GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_I32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ncols = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); diff --git a/ggml/src/ggml-sycl/argsort.cpp b/ggml/src/ggml-sycl/argsort.cpp index 4557599db49b4..282ee6cc6273b 100644 --- 
a/ggml/src/ggml-sycl/argsort.cpp +++ b/ggml/src/ggml-sycl/argsort.cpp @@ -105,6 +105,7 @@ static void argsort_f32_i32_sycl(const float * x, int * dst, const int ncols, co inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_I32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ncols = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index 0d30150246240..9d9b1ab027275 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -233,6 +233,8 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor * src0, const ggml_tensor * } inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const void * src0_dd = static_cast(dst->src[0]->data); const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); @@ -247,6 +249,8 @@ inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const void * src0_dd = static_cast(dst->src[0]->data); const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); @@ -261,6 +265,8 @@ inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const void * 
src0_dd = static_cast(dst->src[0]->data); const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); @@ -275,6 +281,8 @@ inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const void * src0_dd = static_cast(dst->src[0]->data); const void * src1_dd = static_cast(dst->src[1]->data); void * dst_dd = static_cast(dst->data); @@ -289,6 +297,7 @@ inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) } inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const void * src0_d = static_cast(dst->src[0]->data); void * dst_d = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); diff --git a/ggml/src/ggml-sycl/clamp.cpp b/ggml/src/ggml-sycl/clamp.cpp index 8ffee729cdfae..35eb8deca480d 100644 --- a/ggml/src/ggml-sycl/clamp.cpp +++ b/ggml/src/ggml-sycl/clamp.cpp @@ -23,7 +23,7 @@ static void clamp_f32_sycl(const float * x, float * dst, const float min, const inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); float min; float max; diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 3cdc762236d9c..e425f5ec6e618 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -52,6 +52,16 @@ bool gpu_has_xmx(sycl::device &dev) { return dev.has(sycl::aspect::ext_intel_matrix); } +const char * 
ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return GGML_SYCL_NAME "_Split"; + + GGML_UNUSED(buft); +} + +bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { + return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name; +} + int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) { const int64_t max_range = std::numeric_limits::max(); int64_t sycl_down_blk_size = block_size; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 0eb291ecc3b35..faf25dd9c198d 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -436,6 +436,8 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg const queue_ptr &main_stream); bool gpu_has_xmx(sycl::device &dev); +const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft); +bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer); // Some backend specific macros #define GGML_SYCL_TENSOR_BINARY_OP_LOCALS \ diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index fa44cd7b6de84..c69e109d86e9a 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -159,34 +159,34 @@ static void concat_f32_sycl_non_cont( } static void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; - queue_ptr stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - - const int32_t dim = ((int32_t *)dst->op_params)[0]; - - if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { - const float *src0_d = (const float *)src0->data; - const float *src1_d = (const float *)src1->data; - - float *dst_d = (float *)dst->data; - - if (dim != 3) { - for (int i3 = 0; i3 < dst->ne[3]; i3++) { - concat_f32_sycl( - src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] 
/ 4), - dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], - src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream); - } - } else { - const size_t size0 = ggml_nbytes(src0); - const size_t size1 = ggml_nbytes(src1); - - SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); - SYCL_CHECK(CHECK_TRY_ERROR( - stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); - } + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + + const int32_t dim = ((int32_t *) dst->op_params)[0]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + + float * dst_d = (float *) dst->data; + + if (dim != 3) { + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), + dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0], + dst->ne[1], dst->ne[2], dim, stream); + } + } else { + const size_t size0 = ggml_nbytes(src0); + const size_t size1 = ggml_nbytes(src1); + + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); + } } else concat_f32_sycl_non_cont( stream, (const char *)src0->data, (const char *)src1->data, diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index ef310859122f2..578656a8b39b3 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -71,7 +71,9 @@ static void conv_transpose_1d_f32_f32_sycl( }); } -void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +static void 
ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; const float * src0_d = (const float *)src0->data; @@ -97,4 +99,13 @@ void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor src0->ne[0], src0->ne[1], src0->ne[2], src1->ne[0], dst->ne[0], src0_d, src1_d, dst_d, stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } + +void ggml_sycl_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_conv_transpose_1d(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} \ No newline at end of file diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp index f9e60dc758029..d5d69b3f86cbd 100644 --- a/ggml/src/ggml-sycl/conv.hpp +++ b/ggml/src/ggml-sycl/conv.hpp @@ -15,6 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst); +void ggml_sycl_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_CONV_HPP diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp index e6267dbf72680..1559db5dc1372 100644 --- a/ggml/src/ggml-sycl/cpy.cpp +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -339,13 +339,14 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co } void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer)); const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); 
GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS; + GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS SYCL_CHECK(ggml_sycl_set_device(ctx.device)); queue_ptr main_stream = ctx.stream(); diff --git a/ggml/src/ggml-sycl/diagmask.cpp b/ggml/src/ggml-sycl/diagmask.cpp index 8d61463f2d3bf..1ec00194dda07 100644 --- a/ggml/src/ggml-sycl/diagmask.cpp +++ b/ggml/src/ggml-sycl/diagmask.cpp @@ -29,7 +29,7 @@ static void diag_mask_inf_f32_sycl(const float * x, float * dst, const int ncols inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ne00 = dst->src[0]->ne[0]; const int64_t ne01 = dst->src[0]->ne[1]; diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 12890cd83a3d8..f1d9948447ba6 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -508,87 +508,105 @@ void pad_f32_sycl(const float *x, float *dst, const int ne00, }); } -inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + 
std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void 
ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), 
GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); @@ -596,115 +614,142 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor float * dst_dd = static_cast(dst->data); hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); 
const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); log_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception 
caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { 
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor *dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); step_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const float 
* src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); float negative_slope; memcpy(&negative_slope, dst->op_params, sizeof(float)); @@ -715,26 +760,32 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor SYCL_CHECK(ggml_sycl_set_device(ctx.device)); leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); sqr_f32_sycl(src0_dd, 
dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const float sf0 = (float)dst->ne[0]/dst->src[0]->ne[0]; const float sf1 = (float)dst->ne[1]/dst->src[0]->ne[1]; @@ -749,14 +800,17 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * upscale_f32_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const float * src0_dd = static_cast(dst->src[0]->data); float * dst_dd = static_cast(dst->data); @@ -766,17 +820,20 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) pad_f32_sycl(src0_dd, dst_dd, dst->src[0]->ne[0], 
dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, - ggml_tensor *dst) { + ggml_tensor *dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported - GGML_ASSERT(strcmp(dst->src[1]->buffer->buft->iface.get_name(dst->src[1]->buffer->buft), GGML_SYCL_NAME "_Split") != 0); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const dpct::queue_ptr main_stream = ctx.stream(); SYCL_CHECK(ggml_sycl_set_device(ctx.device)); @@ -790,6 +847,9 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, int offset = dst->op_params[3] / 4; // offset in bytes acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index 1833c80a82bf3..00fe37d8e1c51 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -126,15 +126,15 @@ template static void get_rows_sycl_float(ggml_backend_sycl_con } } -void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, 
ggml_tensor * dst) try { GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - GGML_ASSERT(strcmp(dst->src[1]->buffer->buft->iface.get_name(dst->src[1]->buffer->buft), GGML_SYCL_NAME "_Split") != 0); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); switch (dst->src[0]->type) { case GGML_TYPE_F16: @@ -164,4 +164,13 @@ void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ABORT("fatal error"); break; } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_get_rows(ctx, dst); + GGML_SYCL_DEBUG("call %s\n", __func__); } diff --git a/ggml/src/ggml-sycl/getrows.hpp b/ggml/src/ggml-sycl/getrows.hpp index 7060b04d46923..7dc21b99242b1 100644 --- a/ggml/src/ggml-sycl/getrows.hpp +++ b/ggml/src/ggml-sycl/getrows.hpp @@ -3,6 +3,6 @@ #include "common.hpp" -void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_GETROWS_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index ea8a98e879608..88733dfa0d0fb 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -911,17 +911,6 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { }; // sycl split buffer type - -static 
const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return GGML_SYCL_NAME "_Split"; - - GGML_UNUSED(buft); -} - -static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { - return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name; -} - static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor @@ -2686,13 +2675,13 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_argmax(ctx, dst); break; case GGML_OP_CONV_TRANSPOSE_1D: - ggml_sycl_op_conv_transpose_1d(ctx, dst); + ggml_sycl_conv_transpose_1d(ctx, dst); break; case GGML_OP_REPEAT: ggml_sycl_repeat(ctx, dst); break; case GGML_OP_GET_ROWS: - ggml_sycl_op_get_rows(ctx, dst); + ggml_sycl_get_rows(ctx, dst); break; case GGML_OP_DUP: ggml_sycl_dup(ctx, dst); @@ -2854,7 +2843,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_op_rwkv_wkv6(ctx, dst); break; case GGML_OP_GATED_LINEAR_ATTN: - ggml_sycl_op_gated_linear_attn(ctx, dst); + ggml_sycl_gated_linear_attn(ctx, dst); break; default: return false; diff --git a/ggml/src/ggml-sycl/gla.cpp b/ggml/src/ggml-sycl/gla.cpp index 630db5ba6868c..dd01ec3f8ab07 100644 --- a/ggml/src/ggml-sycl/gla.cpp +++ b/ggml/src/ggml-sycl/gla.cpp @@ -75,7 +75,7 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, }); } -void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +static void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const float * k_d = static_cast(dst->src[0]->data); const float * v_d = static_cast(dst->src[1]->data); const float * r_d = 
static_cast(dst->src[2]->data); @@ -103,4 +103,13 @@ void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor } else { gated_linear_attn_f32_kernel<128>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d); } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_gated_linear_attn(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/gla.hpp b/ggml/src/ggml-sycl/gla.hpp index 607cf3a7f3049..a7e3af79db190 100644 --- a/ggml/src/ggml-sycl/gla.hpp +++ b/ggml/src/ggml-sycl/gla.hpp @@ -3,6 +3,6 @@ #include "common.hpp" -void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_GLA_HPP diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index 834aec5a85056..6712290703f97 100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -86,7 +86,7 @@ static void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * d GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->src[1]->buffer->buft->iface.get_name(dst->src[1]->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 54e9ca5583331..1cf6e01f51abe 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -315,7 +315,7 @@ static void 
ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ne00 = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); @@ -338,7 +338,7 @@ static void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); int num_groups = dst->op_params[0]; @@ -360,7 +360,7 @@ static void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ne00 = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index 27c3adca4e975..d21196a214db8 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -39,6 +39,7 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans; const int64_t ldb = (src1_T ? 
nb10 : nb11) / sizeof(float); + GGML_SYCL_DEBUG("call %s\n", __func__); try { // Perform matrix multiplication using oneMKL GEMM @@ -55,4 +56,5 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { std::cerr << exc.what() << std::endl; GGML_ASSERT(false); } + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/pool2d.cpp b/ggml/src/ggml-sycl/pool2d.cpp index ee67307ca4d33..b010e7d3a11e5 100644 --- a/ggml/src/ggml-sycl/pool2d.cpp +++ b/ggml/src/ggml-sycl/pool2d.cpp @@ -71,7 +71,7 @@ static void pool2d_nchw_kernel(const int ih, const int iw, const int oh, const i static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int32_t * opts = (const int32_t *) dst->op_params; enum ggml_op_pool op = static_cast(opts[0]); diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index b7f03222f9841..78fe1d0012078 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -197,8 +197,8 @@ static void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); GGML_ASSERT(dst->src[0]->type == dst->type); - GGML_ASSERT(strcmp(dst->src[1]->buffer->buft->iface.get_name(dst->src[1]->buffer->buft), GGML_SYCL_NAME "_Split") != 0); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ne00 = dst->src[0]->ne[0]; const int64_t ne01 = 
dst->src[0]->ne[1]; diff --git a/ggml/src/ggml-sycl/scale.cpp b/ggml/src/ggml-sycl/scale.cpp index f37f852cf0607..74c1178367ed3 100644 --- a/ggml/src/ggml-sycl/scale.cpp +++ b/ggml/src/ggml-sycl/scale.cpp @@ -21,7 +21,7 @@ static void scale_f32_sycl(const float * x, float * dst, const float scale, cons inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); float scale; memcpy(&scale, dst->op_params, sizeof(float)); diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index 018fed5a956ed..b2b73dc68d54a 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -228,6 +228,7 @@ static void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); GGML_ASSERT(!dst->src[1] || dst->src[1]->type == GGML_TYPE_F16 || dst->src[1]->type == GGML_TYPE_F32); // src1 contains mask and it is optional diff --git a/ggml/src/ggml-sycl/sum.cpp b/ggml/src/ggml-sycl/sum.cpp index 66d3c8f6d6f8b..acec1493a4f63 100644 --- a/ggml/src/ggml-sycl/sum.cpp +++ b/ggml/src/ggml-sycl/sum.cpp @@ -27,7 +27,7 @@ static void sum_rows_f32_sycl(const float * x, float * dst, const int ncols, con inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ne = ggml_nelements(dst->src[0]); dpct::queue_ptr 
main_stream = ctx.stream(); @@ -44,7 +44,7 @@ inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); const int64_t ncols = dst->src[0]->ne[0]; const int64_t nrows = ggml_nrows(dst->src[0]); diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp index 9de324c3a14c4..a5aedd4d0e252 100644 --- a/ggml/src/ggml-sycl/tsembd.cpp +++ b/ggml/src/ggml-sycl/tsembd.cpp @@ -55,7 +55,7 @@ static void timestep_embedding_f32_sycl( }); } -void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const ggml_tensor *src0 = dst->src[0]; const float * src0_d = static_cast(src0->data); float * dst_d = static_cast(dst->data); @@ -66,6 +66,10 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso const int dim = dst->op_params[0]; const int max_period = dst->op_params[1]; - + GGML_SYCL_DEBUG("call %s\n", __func__); timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp index 9c20135fd4dc9..4c01f29b98a57 100644 --- a/ggml/src/ggml-sycl/wkv6.cpp +++ b/ggml/src/ggml-sycl/wkv6.cpp @@ -95,7 +95,7 @@ static void rwkv_wkv_f32_kernel( } } -void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { +void 
ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) try { const float* k_d = (const float*)dst->src[0]->data; const float* v_d = (const float*)dst->src[1]->data; @@ -121,6 +121,7 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td sycl::range<3> block_dims(1, 1, C / H); sycl::range<3> grid_dims(1, 1, B * H); + GGML_SYCL_DEBUG("call %s", __func__); // Submit kernel stream->submit([&](sycl::handler& cgh) { @@ -135,5 +136,9 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { ); }); }); + GGML_SYCL_DEBUG("call %s done", __func__); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } From 52b06526013e952bf8dbe6fa3cd3e1ee5a8c7a93 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 3 Feb 2025 18:47:50 +0530 Subject: [PATCH 37/40] conv: add space before eof --- ggml/src/ggml-sycl/conv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index 578656a8b39b3..7d0bb730a19b4 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -108,4 +108,4 @@ void ggml_sycl_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor * GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_conv_transpose_1d(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); -} \ No newline at end of file +} From 0b602f0ecd52b83414e2a1c4478723f96cbbdad9 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 3 Feb 2025 21:08:25 +0530 Subject: [PATCH 38/40] Final touches --- ggml/src/ggml-sycl/binbcast.cpp | 1 - ggml/src/ggml-sycl/ggml-sycl.cpp | 13 ++++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index 
9d9b1ab027275..8fc6e1b56b22f 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -1,5 +1,4 @@ #include "binbcast.hpp" -#include "common.hpp" static __dpct_inline__ float op_repeat(const float a, const float b) { return b; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 88733dfa0d0fb..662fb27a9ec9a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -309,6 +309,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -333,6 +334,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -361,6 +363,7 @@ static bool ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); if (ggml_backend_buffer_is_sycl(src->buffer)) { ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context; @@ -418,7 +421,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) try { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; ggml_sycl_set_device(ctx->device); queue_ptr 
stream = ctx->stream; @@ -465,6 +469,7 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; ggml_sycl_set_device(buft_ctx->device); const queue_ptr stream = buft_ctx->stream; @@ -708,6 +713,7 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff static void ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; @@ -791,6 +797,7 @@ static void ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -844,6 +851,7 @@ static void ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -894,6 +902,7 @@ catch (sycl::exception const &exc) { } static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); GGML_UNUSED(buffer); GGML_UNUSED(value); } @@ -1017,10 +1026,12 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ } 
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); ggml_sycl_host_free(buffer->context); } static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); void * ptr = ggml_sycl_host_malloc(size); if (ptr == nullptr) { From efb5773bc2293dc53f0997c7fb5bca193903944b Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Wed, 5 Feb 2025 09:01:25 +0530 Subject: [PATCH 39/40] ggml-sycl: hide matrix engine info for now from print sycl devices --- ggml/src/ggml-sycl/ggml-sycl.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 662fb27a9ec9a..5a38556f3d708 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -98,11 +98,10 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); auto global_mem_size = prop.get_global_mem_size()/1000000; - std::string xmx = gpu_has_xmx(device) ? 
"yes" : "no"; - GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(), + GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), name.c_str(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), - global_mem_size, device.get_info().c_str(), xmx.c_str()); + global_mem_size, device.get_info().c_str()); } void ggml_backend_sycl_print_sycl_devices() { @@ -113,16 +112,16 @@ void ggml_backend_sycl_print_sycl_devices() { GGML_LOG_INFO( "| | | | " - " |Max | |Max |Global | | XMX |\n"); + " |Max | |Max |Global | |\n"); GGML_LOG_INFO( "| | | | " - " |compute|Max work|sub |mem | | or |\n"); + " |compute|Max work|sub |mem | |\n"); GGML_LOG_INFO( "|ID| Device Type| " - "Name|Version|units |group |group|size | Driver version| Tensor Cores |\n"); + "Name|Version|units |group |group|size | Driver version|\n"); GGML_LOG_INFO( "|--|-------------------|---------------------------------------|------" - "-|-------|--------|-----|-------|---------------------|--------------|\n"); + "-|-------|--------|-----|-------|---------------------|\n"); for (int id = 0; id < device_count; ++id) { sycl::device device = dpct::dev_mgr::instance().get_device(id); From cfa2cc1e403c156cf5f9c4429a2309702c6b841e Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Wed, 5 Feb 2025 13:33:46 +0530 Subject: [PATCH 40/40] Disable non-contiguous tensor support in norm kernels and add newline at the end of debug logs --- ggml/src/ggml-sycl/ggml-sycl.cpp | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 5a38556f3d708..ca92e966c3fb4 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -33,6 +33,7 @@ #include "common.hpp" #include "ggml-sycl/backend.hpp" #include "ggml-sycl/gemm.hpp" +#include "ggml.h" static bool g_sycl_loaded = false; 
int g_ggml_sycl_debug = 0; @@ -308,7 +309,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -333,7 +334,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -362,7 +363,7 @@ static bool ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); if (ggml_backend_buffer_is_sycl(src->buffer)) { ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context; @@ -420,7 +421,7 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; ggml_sycl_set_device(ctx->device); @@ -468,7 +469,7 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); 
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; ggml_sycl_set_device(buft_ctx->device); const queue_ptr stream = buft_ctx->stream; @@ -712,7 +713,7 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff static void ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; @@ -796,7 +797,7 @@ static void ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -850,7 +851,7 @@ static void ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -901,7 +902,7 @@ catch (sycl::exception const &exc) { } static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); GGML_UNUSED(buffer); GGML_UNUSED(value); } @@ -1025,7 +1026,7 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ } static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - 
GGML_SYCL_DEBUG("[SYCL] call %s", __func__); + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); ggml_sycl_host_free(buffer->context); } @@ -3277,14 +3278,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - case GGML_OP_NORM: case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_LOG: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: + return true; + case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: + return ggml_is_contiguous(op->src[0]); case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SQRT: @@ -3316,7 +3320,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: - case GGML_OP_GROUP_NORM: case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_LEAKY_RELU: