From efe270104af18c7e7936657ae7c643a222d5ca84 Mon Sep 17 00:00:00 2001
From: Gitty Burstein
Date: Mon, 15 Sep 2025 14:32:30 +0300
Subject: [PATCH 1/3] SYCL/SET: implement operator + wire-up; docs/ops updates;
 element_wise & ggml-sycl changes

---
 docs/ops.md                         |   2 +-
 ggml/src/ggml-sycl/element_wise.cpp | 135 ++++++++++++++++++++++++++++
 ggml/src/ggml-sycl/element_wise.hpp |   2 +
 ggml/src/ggml-sycl/ggml-sycl.cpp    |  22 +++++
 4 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/docs/ops.md b/docs/ops.md
index 0047ef3fa5e53..89ddc86a5dc10 100644
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -85,7 +85,7 @@ Legend:
 | RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 | RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 | SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
+| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
 | SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
 | SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 | SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index c2da2fb48ad28..58c72beb7c34e 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -2,6 +2,7 @@
 #include "ggml-sycl/presets.hpp"
 #include "ggml.h"
 #include "element_wise.hpp"
+#include <cstring>
 
 #define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
     for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))
@@ -926,6 +927,135 @@ static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::pad_sycl(src, dst_ptr, ne00, ne01, ne02, ne0, ne1, ne2, stream);
     });
 }
+static inline void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    GGML_ASSERT(dst->src[1] != nullptr);
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(src1->type == dst->type);
+#if defined(GGML_SYCL_F16)
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_I32);
+#else
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_I32);
+#endif
+    const size_t ts = ggml_type_size(dst->type);
+
+    dpct::queue_ptr q = ctx.stream();
+    {
+        const bool same_type = (src0->type == dst->type);
+        const bool src_cont = ggml_is_contiguous(src0);
+        const bool dst_cont = ggml_is_contiguous(dst);
+
+        const void *p_src0 = src0->data;
+        void *p_dst = dst->data;
+
+        auto pt_src0 = sycl::get_pointer_type((const char*)p_src0, q->get_context());
+        auto pt_dst = sycl::get_pointer_type((char*)p_dst, q->get_context());
+
+        if (same_type && src_cont && dst_cont && ggml_nelements(src0) == ggml_nelements(dst)) {
+            const size_t bytes = ggml_nbytes(dst);
+            if (pt_src0 != sycl::usm::alloc::unknown && pt_dst != sycl::usm::alloc::unknown) {
+                SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(p_dst, p_src0, bytes)));
+            } else {
+                std::memcpy(p_dst, p_src0, bytes);
+            }
+        } else {
+            const int64_t ne0 = dst->ne[0], ne1 = dst->ne[1], ne2 = dst->ne[2], ne3 = dst->ne[3];
+            const size_t db0 = dst->nb[0], db1 = dst->nb[1], db2 = dst->nb[2], db3 = dst->nb[3];
+            const size_t sb0 = src0->nb[0], sb1 = src0->nb[1], sb2 = src0->nb[2], sb3 = src0->nb[3];
+
+            const size_t N = (size_t) ggml_nelements(dst);
+            const size_t WG = 256;
+            const size_t NG = ((N + WG - 1) / WG) * WG;
+
+            const size_t ge0 = (size_t) ne0;
+            const size_t ge1 = ge0 * (size_t) ne1;
+            const size_t ge2 = ge1 * (size_t) ne2;
+
+            q->parallel_for(
+                sycl::nd_range<1>(sycl::range<1>(NG), sycl::range<1>(WG)),
+                [=](sycl::nd_item<1> 
it) { + size_t idx = it.get_global_linear_id(); + if (idx >= N) return; + + size_t i3 = idx / ge2; size_t r2 = idx % ge2; + size_t i2 = r2 / ge1; size_t r1 = r2 % ge1; + size_t i1 = r1 / ge0; size_t i0 = r1 % ge0; + + const char * s = (const char*)p_src0 + (i0*sb0 + i1*sb1 + i2*sb2 + i3*sb3); + char * d = (char*)p_dst + (i0*db0 + i1*db1 + i2*db2 + i3*db3); + + for (size_t b = 0; b < ts; ++b) d[b] = s[b]; + } + ); + } + } + + { + const int32_t *p = (const int32_t *) dst->op_params; + const size_t nb1 = (size_t) p[0]; + const size_t nb2 = (size_t) p[1]; + const size_t nb3 = (size_t) p[2]; + const size_t offset = (size_t) p[3]; + + const void *p_src1 = src1->data; + void *p_dst = dst->data; + + const size_t sb0 = src1->nb[0], sb1 = src1->nb[1], sb2 = src1->nb[2], sb3 = src1->nb[3]; + const size_t db0 = dst->nb[0]; + const int64_t ne0 = src1->ne[0], ne1 = src1->ne[1], ne2 = src1->ne[2], ne3 = src1->ne[3]; + + if (ggml_is_contiguous(src1) && db0 == ts) { + const size_t row_bytes = (size_t) ne0 * ts; + const char *s_base = (const char*) p_src1; + char *d_base = (char*) p_dst + offset; + + for (int64_t i3 = 0; i3 < ne3; ++i3) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + const char *s_row = s_base + i1*sb1 + i2*sb2 + i3*sb3; + char *d_row = d_base + i1*nb1 + i2*nb2 + i3*nb3; + + auto pt_s = sycl::get_pointer_type(s_row, q->get_context()); + auto pt_d = sycl::get_pointer_type(d_row, q->get_context()); + if (pt_s != sycl::usm::alloc::unknown && pt_d != sycl::usm::alloc::unknown) { + SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(d_row, s_row, row_bytes))); + } else { + std::memcpy(d_row, s_row, row_bytes); + } + } + } + } + } else { + + const size_t N = (size_t) (ne0 * ne1 * ne2 * ne3); + const size_t WG = 256; + const size_t NG = ((N + WG - 1) / WG) * WG; + + const size_t ge0 = (size_t) ne0; + const size_t ge1 = ge0 * (size_t) ne1; + const size_t ge2 = ge1 * (size_t) ne2; + + q->parallel_for( + sycl::nd_range<1>(sycl::range<1>(NG), sycl::range<1>(WG)), + [=](sycl::nd_item<1> it) { + size_t idx = it.get_global_linear_id(); + if (idx >= N) return; + + size_t i3 = idx / ge2; size_t r2 = idx % ge2; + size_t i2 = r2 / ge1; size_t r1 = r2 % ge1; + size_t i1 = r1 / ge0; size_t i0 = r1 % ge0; + + const char * s = (const char*) p_src1 + (i0*sb0 + i1*sb1 + i2*sb2 + i3*sb3); + char * d = (char*) p_dst + offset + (i0*db0 + i1*nb1 + i2*nb2 + i3*nb3); + + for (size_t b = 0; b < ts; ++b) d[b] = s[b]; + } + ); + } + } +} static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { float min_val; @@ -1124,6 +1254,11 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_pad(ctx, dst); } +void ggml_sycl_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + ggml_sycl_op_set(ctx, dst); +} + void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_clamp(ctx, dst); diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 50749e87d783e..3e34af325a6ad 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -83,4 +83,6 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void 
ggml_sycl_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
 #endif // GGML_SYCL_ELEMENTWISE_HPP
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 9404e3ff4ad9b..17bc866053da3 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3564,6 +3564,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
     case GGML_OP_GET_ROWS:
         ggml_sycl_get_rows(ctx, dst);
         break;
+    case GGML_OP_SET:
+        ggml_sycl_set(ctx, dst);
+        break;
     case GGML_OP_SET_ROWS:
         ggml_sycl_op_set_rows(ctx, dst);
         break;
@@ -4167,6 +4170,25 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_
 static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     switch (op->op) {
+        case GGML_OP_SET: {
+#if defined(GGML_SYCL_F16)
+            const bool types_ok =
+                (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_I32) &&
+                (op->src[0]->type == op->type) &&
+                (op->src[1] && op->src[1]->type == op->type);
+#else
+            const bool types_ok =
+                (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) &&
+                (op->src[0]->type == op->type) &&
+                (op->src[1] && op->src[1]->type == op->type);
+#endif
+
+            const bool contiguous_ok =
+                ggml_is_contiguous(op->src[0]) &&
+                (!op->src[1] || ggml_is_contiguous(op->src[1]));
+
+            return types_ok && contiguous_ok;
+}
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
                 ggml_type src0_type = op->src[0]->type;

From e654008a71fa9989158e5851fd3e25e2f0e8041d Mon Sep 17 00:00:00 2001
From: Gitty Burstein
Date: Tue, 16 Sep 2025 18:01:22 +0300
Subject: [PATCH 2/3] sycl(SET): re-apply post-rebase; revert manual
 docs/ops.md; style cleanups

---
 docs/ops.md                         |  2 +-
 ggml/src/ggml-sycl/element_wise.cpp | 12 ++++--------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/docs/ops.md b/docs/ops.md
index 89ddc86a5dc10..0047ef3fa5e53 100644
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -85,7 +85,7 @@ Legend:
 | RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 | RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 | SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ |
+| SET | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
 | SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
 | SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 | SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 58c72beb7c34e..a29cbea40fbf4 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -2,8 +2,7 @@
 #include "ggml-sycl/presets.hpp"
 #include "ggml.h"
 #include "element_wise.hpp"
-#include <cstring>
-
+#include <cstring>
 #define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
     for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))
@@ -939,8 +938,7 @@ static inline void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor
 #else
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_I32);
 #endif
-    const size_t ts = ggml_type_size(dst->type);
-
+    const size_t ts = ggml_type_size(dst->type);
     dpct::queue_ptr q = ctx.stream();
     {
         const bool same_type = (src0->type == dst->type);
@@ -1003,9 +1001,8 @@ static inline void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor
         void *p_dst = dst->data;
 
         const size_t sb0 = src1->nb[0], sb1 = src1->nb[1], sb2 = src1->nb[2], sb3 = src1->nb[3];
-        const size_t db0 = dst->nb[0];
+        const size_t db0 = dst->nb[0];
         const int64_t ne0 = src1->ne[0], ne1 = src1->ne[1], ne2 = src1->ne[2], ne3 = src1->ne[3];
-
         if 
(ggml_is_contiguous(src1) && db0 == ts) {
             const size_t row_bytes = (size_t) ne0 * ts;
             const char *s_base = (const char*) p_src1;
             char *d_base = (char*) p_dst + offset;
 
@@ -1028,11 +1025,10 @@ static inline void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor
                     }
                 }
             }
         } else {
-
+
             const size_t N = (size_t) (ne0 * ne1 * ne2 * ne3);
             const size_t WG = 256;
             const size_t NG = ((N + WG - 1) / WG) * WG;
-
             const size_t ge0 = (size_t) ne0;
             const size_t ge1 = ge0 * (size_t) ne1;
             const size_t ge2 = ge1 * (size_t) ne2;

From f5a0dcb566eb478ad4d34923094012b57b8007b6 Mon Sep 17 00:00:00 2001
From: Gitty Burstein
Date: Thu, 18 Sep 2025 22:23:06 +0300
Subject: [PATCH 3/3] move SET op to standalone file, GPU-only implementation

---
 ggml/src/ggml-sycl/element_wise.cpp | 133 +-----------------------
 ggml/src/ggml-sycl/element_wise.hpp |   2 -
 ggml/src/ggml-sycl/ggml-sycl.cpp    |  43 +++-----
 ggml/src/ggml-sycl/presets.hpp      |   1 +
 ggml/src/ggml-sycl/set.cpp          | 155 ++++++++++++++++++++++++++++
 ggml/src/ggml-sycl/set.hpp          |   5 +
 6 files changed, 176 insertions(+), 163 deletions(-)
 create mode 100644 ggml/src/ggml-sycl/set.cpp
 create mode 100644 ggml/src/ggml-sycl/set.hpp

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index a29cbea40fbf4..c2da2fb48ad28 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -2,7 +2,7 @@
 #include "ggml-sycl/presets.hpp"
 #include "ggml.h"
 #include "element_wise.hpp"
-#include <cstring>
+
 #define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
     for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))
@@ -926,132 +926,6 @@ static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::pad_sycl(src, dst_ptr, ne00, ne01, ne02, ne0, ne1, ne2, stream);
     });
 }
-static inline void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    GGML_ASSERT(dst->src[1] != nullptr);
-    const ggml_tensor * src1 = dst->src[1];
-
-    GGML_ASSERT(src0->type == dst->type);
-    GGML_ASSERT(src1->type == dst->type);
-#if defined(GGML_SYCL_F16)
-    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_I32);
-#else
-    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_I32);
-#endif
-    const size_t ts = ggml_type_size(dst->type);
-    dpct::queue_ptr q = ctx.stream();
-    {
-        const bool same_type = (src0->type == dst->type);
-        const bool src_cont = ggml_is_contiguous(src0);
-        const bool dst_cont = ggml_is_contiguous(dst);
-
-        const void *p_src0 = src0->data;
-        void *p_dst = dst->data;
-
-        auto pt_src0 = sycl::get_pointer_type((const char*)p_src0, q->get_context());
-        auto pt_dst = sycl::get_pointer_type((char*)p_dst, q->get_context());
-
-        if (same_type && src_cont && dst_cont && ggml_nelements(src0) == ggml_nelements(dst)) {
-            const size_t bytes = ggml_nbytes(dst);
-            if (pt_src0 != sycl::usm::alloc::unknown && pt_dst != sycl::usm::alloc::unknown) {
-                SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(p_dst, p_src0, bytes)));
-            } else {
-                std::memcpy(p_dst, p_src0, bytes);
-            }
-        } else {
-            const int64_t ne0 = dst->ne[0], ne1 = dst->ne[1], ne2 = dst->ne[2], ne3 = dst->ne[3];
-            const size_t db0 = dst->nb[0], db1 = dst->nb[1], db2 = dst->nb[2], db3 = dst->nb[3];
-            const size_t sb0 = src0->nb[0], sb1 = src0->nb[1], sb2 = src0->nb[2], sb3 = src0->nb[3];
-
-            const size_t N = (size_t) ggml_nelements(dst);
-            const size_t WG = 256;
-            const size_t NG = ((N + WG - 1) / WG) * WG;
-
-            const size_t ge0 = (size_t) ne0;
-            const size_t ge1 = ge0 * (size_t) ne1;
-            const 
size_t ge2 = ge1 * (size_t) ne2; - - q->parallel_for( - sycl::nd_range<1>(sycl::range<1>(NG), sycl::range<1>(WG)), - [=](sycl::nd_item<1> it) { - size_t idx = it.get_global_linear_id(); - if (idx >= N) return; - - size_t i3 = idx / ge2; size_t r2 = idx % ge2; - size_t i2 = r2 / ge1; size_t r1 = r2 % ge1; - size_t i1 = r1 / ge0; size_t i0 = r1 % ge0; - - const char * s = (const char*)p_src0 + (i0*sb0 + i1*sb1 + i2*sb2 + i3*sb3); - char * d = (char*)p_dst + (i0*db0 + i1*db1 + i2*db2 + i3*db3); - - for (size_t b = 0; b < ts; ++b) d[b] = s[b]; - } - ); - } - } - - { - const int32_t *p = (const int32_t *) dst->op_params; - const size_t nb1 = (size_t) p[0]; - const size_t nb2 = (size_t) p[1]; - const size_t nb3 = (size_t) p[2]; - const size_t offset = (size_t) p[3]; - - const void *p_src1 = src1->data; - void *p_dst = dst->data; - - const size_t sb0 = src1->nb[0], sb1 = src1->nb[1], sb2 = src1->nb[2], sb3 = src1->nb[3]; - const size_t db0 = dst->nb[0]; - const int64_t ne0 = src1->ne[0], ne1 = src1->ne[1], ne2 = src1->ne[2], ne3 = src1->ne[3]; - if (ggml_is_contiguous(src1) && db0 == ts) { - const size_t row_bytes = (size_t) ne0 * ts; - const char *s_base = (const char*) p_src1; - char *d_base = (char*) p_dst + offset; - - for (int64_t i3 = 0; i3 < ne3; ++i3) { - for (int64_t i2 = 0; i2 < ne2; ++i2) { - for (int64_t i1 = 0; i1 < ne1; ++i1) { - const char *s_row = s_base + i1*sb1 + i2*sb2 + i3*sb3; - char *d_row = d_base + i1*nb1 + i2*nb2 + i3*nb3; - - auto pt_s = sycl::get_pointer_type(s_row, q->get_context()); - auto pt_d = sycl::get_pointer_type(d_row, q->get_context()); - if (pt_s != sycl::usm::alloc::unknown && pt_d != sycl::usm::alloc::unknown) { - SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(d_row, s_row, row_bytes))); - } else { - std::memcpy(d_row, s_row, row_bytes); - } - } - } - } - } else { - - const size_t N = (size_t) (ne0 * ne1 * ne2 * ne3); - const size_t WG = 256; - const size_t NG = ((N + WG - 1) / WG) * WG; - const size_t ge0 = (size_t) ne0; - const size_t ge1 = ge0 * (size_t) ne1; - const size_t ge2 = ge1 * (size_t) ne2; - - q->parallel_for( - sycl::nd_range<1>(sycl::range<1>(NG), sycl::range<1>(WG)), - [=](sycl::nd_item<1> it) { - size_t idx = it.get_global_linear_id(); - if (idx >= N) return; - - size_t i3 = idx / ge2; size_t r2 = idx % ge2; - size_t i2 = r2 / ge1; size_t r1 = r2 % ge1; - size_t i1 = r1 / ge0; size_t i0 = r1 % ge0; - - const char * s = (const char*) p_src1 + (i0*sb0 + i1*sb1 + i2*sb2 + i3*sb3); - char * d = (char*) p_dst + offset + (i0*db0 + i1*nb1 + i2*nb2 + i3*nb3); - - for (size_t b = 0; b < ts; ++b) d[b] = s[b]; - } - ); - } - } -} static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { float min_val; @@ -1250,11 +1124,6 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_pad(ctx, dst); } -void ggml_sycl_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - ggml_sycl_op_set(ctx, dst); -} - void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_clamp(ctx, dst); diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 3e34af325a6ad..50749e87d783e 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -83,6 +83,4 @@ void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, 
ggml_tensor * dst); void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - #endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 17bc866053da3..ec00446f25019 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -42,6 +42,7 @@ #include "ggml-sycl/presets.hpp" #include "ggml-sycl/gemm.hpp" #include "ggml-sycl/set_rows.hpp" +#include "ggml-sycl/set.hpp" #include "ggml-sycl/sycl_hw.hpp" #include "ggml-sycl/getrows.hpp" #include "ggml-sycl/quantize.hpp" @@ -3565,7 +3566,7 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg ggml_sycl_get_rows(ctx, dst); break; case GGML_OP_SET: - ggml_sycl_set(ctx, dst); + ggml_sycl_op_set(ctx, dst); break; case GGML_OP_SET_ROWS: ggml_sycl_op_set_rows(ctx, dst); @@ -4170,34 +4171,6 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { switch (op->op) { - case GGML_OP_SET: { -#if defined(GGML_SYCL_F16) - const bool types_ok = - (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_I32) && - (op->src[0]->type == op->type) && - (op->src[1] && op->src[1]->type == op->type); -#else - const bool types_ok = - (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) && - (op->src[0]->type == op->type) && - (op->src[1] && op->src[1]->type == op->type); -#endif - - const bool contiguous_ok = - ggml_is_contiguous(op->src[0]) && - (!op->src[1] || ggml_is_contiguous(op->src[1])); - - return types_ok && contiguous_ok; -} - case GGML_OP_CONV_TRANSPOSE_1D: - { - ggml_type src0_type = op->src[0]->type; - ggml_type src1_type = op->src[1]->type; - if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { - return true; - } - return false; - } case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_NEG: @@ -4288,6 +4261,18 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return false; } } + case GGML_OP_SET: + #if defined(GGML_SYCL_F16) + return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_I32) && + (op->src[0] && op->src[1]) && + (op->src[0]->type == op->type) && + (op->src[1]->type == op->type); + #else + return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_I32) && + (op->src[0] && op->src[1]) && + (op->src[0]->type == op->type) && + (op->src[1]->type == op->type); + #endif case GGML_OP_SET_ROWS: { return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 || diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index af1890727df8f..f2e1f02a172ab 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -31,6 +31,7 @@ #define SYCL_SQRT_BLOCK_SIZE 256 #define SYCL_SIN_BLOCK_SIZE 256 #define SYCL_SQR_BLOCK_SIZE 256 +#define SYCL_SET_BLOCK_SIZE 256 #define SYCL_CPY_BLOCK_SIZE 32 #define SYCL_SCALE_BLOCK_SIZE 256 #define SYCL_CLAMP_BLOCK_SIZE 256 diff --git a/ggml/src/ggml-sycl/set.cpp b/ggml/src/ggml-sycl/set.cpp new file mode 100644 index 0000000000000..6395ac649e90b --- /dev/null +++ b/ggml/src/ggml-sycl/set.cpp @@ -0,0 +1,155 @@ +// ggml/src/ggml-sycl/set.cpp +// +// SYCL backend for GGML SET operator. 
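+//
+// Entry point (illustrative sketch, not part of this patch): user code
+// reaches this operator through the public ggml_set() API declared in
+// ggml.h, which records the byte strides and offset in op_params. With
+// hypothetical tensors `a` and `b`:
+//
+//   // paste b over a copy of a, starting at row 3 (byte offset 3*a->nb[1]):
+//   struct ggml_tensor * r = ggml_set(ctx, a, b,
+//                                     a->nb[1], a->nb[2], a->nb[3], 3*a->nb[1]);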
+//
+// Semantics:
+//   1) dst <- src0
+//   2) copy a sub-block from src1 into dst at byte `offset`,
+//      using destination byte-strides (nb1, nb2, nb3) for dims 1..3.
+//
+// Notes:
+//   - (nb1, nb2, nb3, offset) are BYTES (CPU-compatible).
+//   - Uses two fast paths (bulk memcpy; row-wise memcpy) and a generic 4D kernel.
+//   - Work-group size is configured in presets (SYCL_SET_BLOCK_SIZE).
+//
+// Implementation style aligned with other SYCL operators:
+//   - No host std::memcpy fallback; no USM detection.
+//   - Copies use queue->memcpy; generic case uses a parallel_for kernel.
+
+#include "presets.hpp"   // SYCL_* tuning (incl. SYCL_SET_BLOCK_SIZE)
+#include "common.hpp"
+#include "ggml.h"
+#include "set.hpp"
+
+#include <cstdint>
+#include <cstddef>
+
+// ---------------- helpers (file-local) ----------------
+
+// Byte-accurate 4D copy with independent src/dst byte strides.
+// One work-item copies exactly one element (ts bytes).
+static inline void launch_copy_4d_bytes(
+    dpct::queue_ptr q,
+    const void *p_src, void *p_dst,
+    const int64_t ne[4],
+    const size_t sb[4],
+    const size_t db[4],
+    const size_t ts
+) {
+    const size_t N = (size_t)(ne[0] * ne[1] * ne[2] * ne[3]);
+    if (N == 0) return;
+
+    const size_t WG = (size_t)SYCL_SET_BLOCK_SIZE;
+    const size_t NG = ((N + WG - 1) / WG) * WG;
+
+    const size_t ge0 = (size_t) ne[0];
+    const size_t ge1 = ge0 * (size_t) ne[1];
+    const size_t ge2 = ge1 * (size_t) ne[2];
+
+    q->parallel_for(
+        sycl::nd_range<1>(sycl::range<1>(NG), sycl::range<1>(WG)),
+        [=](sycl::nd_item<1> it) {
+            size_t idx = it.get_global_linear_id();
+            if (idx >= N) return;
+
+            // 4D indexing
+            size_t i3 = idx / ge2; size_t r2 = idx % ge2;
+            size_t i2 = r2 / ge1;  size_t r1 = r2 % ge1;
+            size_t i1 = r1 / ge0;  size_t i0 = r1 % ge0;
+
+            const char *s = (const char *)p_src + (i0*sb[0] + i1*sb[1] + i2*sb[2] + i3*sb[3]);
+            char *d = (char *)p_dst + (i0*db[0] + i1*db[1] + i2*db[2] + i3*db[3]);
+
+            #pragma unroll
+            for (size_t b = 0; b < ts; ++b) {
+                d[b] = s[b];
+            }
+        }
+    );
+}
+
+// --------------------------- operator ---------------------------
+
+void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst != nullptr);
+    const ggml_tensor * src0 = dst->src[0];
+    GGML_ASSERT(dst->src[1] != nullptr);
+    const ggml_tensor * src1 = dst->src[1];
+    GGML_ASSERT(src0 && src1);
+
+    // Type constraints (CPU-compatible)
+    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(src1->type == dst->type);
+#if defined(GGML_SYCL_F16)
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_I32);
+#else
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_I32);
+#endif
+
+    dpct::queue_ptr q = ctx.stream();
+    const size_t ts = ggml_type_size(dst->type);
+
+    // Stage 1: dst <- src0
+    {
+        const bool same_type = (src0->type == dst->type);
+        const bool src_cont = ggml_is_contiguous(src0);
+        const bool dst_cont = ggml_is_contiguous(dst);
+
+        const void *p_src0 = src0->data;
+        void *p_dst = dst->data;
+
+        if (same_type && src_cont && dst_cont &&
+            ggml_nelements(src0) == ggml_nelements(dst)) {
+            SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(p_dst, p_src0, ggml_nbytes(dst))));
+        } else {
+            // generic 4D copy
+            const int64_t ne[4] = { dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3] };
+            const size_t sb[4] = { src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3] };
+            const size_t db[4] = { dst ->nb[0], dst ->nb[1], dst ->nb[2], dst ->nb[3] };
+            launch_copy_4d_bytes(q, p_src0, p_dst, ne, sb, db, ts);
+        }
+    }
+
+    // Stage 2: paste src1 sub-block into dst
+    {
+        // 
op_params: [ nb1, nb2, nb3, offset ] (BYTES) + const int32_t *p = (const int32_t *) dst->op_params; + const size_t nb1 = (size_t) p[0]; + const size_t nb2 = (size_t) p[1]; + const size_t nb3 = (size_t) p[2]; + const size_t offset = (size_t) p[3]; + + const void *p_src1 = src1->data; + void *p_base = (char *) dst->data + offset; + + const bool src1_cont = ggml_is_contiguous(src1); + const bool dst_tight = (dst->nb[0] == ts); // tightly-packed rows + + if (src1_cont && dst_tight) { + // Row-wise device memcpy of src1 into dst at the given offset + const char *s_base = (const char *) p_src1; + char *d_base = (char *) p_base; + const size_t row_bytes = (size_t) src1->ne[0] * ts; + + const size_t sb1 = src1->nb[1]; + const size_t sb2 = src1->nb[2]; + const size_t sb3 = src1->nb[3]; + + for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) { + for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) { + for (int64_t i1 = 0; i1 < src1->ne[1]; ++i1) { + const char *s_row = s_base + i1*sb1 + i2*sb2 + i3*sb3; + char *d_row = d_base + i1*nb1 + i2*nb2 + i3*nb3; + SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(d_row, s_row, row_bytes))); + } + } + } + } else { + // Generic 4D copy from src1 into (offsetted) dst base + const int64_t ne[4] = { src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3] }; + const size_t sb[4] = { src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3] }; + const size_t db[4] = { dst->nb[0], nb1, nb2, nb3 }; + launch_copy_4d_bytes(q, p_src1, p_base, ne, sb, db, ts); + } + } +} diff --git a/ggml/src/ggml-sycl/set.hpp b/ggml/src/ggml-sycl/set.hpp new file mode 100644 index 0000000000000..4cb9729d0a889 --- /dev/null +++ b/ggml/src/ggml-sycl/set.hpp @@ -0,0 +1,5 @@ +#pragma once +#include "backend.hpp" +#include "ggml.h" + +void ggml_sycl_op_set(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
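+
+// Worked example (illustrative, not from the patch): with F32 (ts = 4 bytes)
+// and a contiguous 16-column dst, pasting src1 so its first element lands at
+// dst element (i0 = 2, i1 = 5) uses byte-based params
+//   nb1 = 16*4 = 64, offset = (5*16 + 2)*4 = 328,
+// which stage 2 of ggml_sycl_op_set() consumes as
+//   d = dst->data + offset + i0*dst->nb[0] + i1*nb1 + i2*nb2 + i3*nb3.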