From 4b76ed16bca304e9450ce50b97e7b86a449c7464 Mon Sep 17 00:00:00 2001 From: tamarPal Date: Sun, 19 Oct 2025 13:35:56 +0300 Subject: [PATCH 1/5] sycl: add ROLL operation support - Implement ggml_sycl_roll function for F32 tensors - Add multi-axis roll operation with SYCL kernel - Support all 4 tensor dimensions with proper shift normalization - Add roll.cpp and roll.hpp to SYCL backend - Update backend dispatch and supports_op for GGML_OP_ROLL - Tests: 17662/17662 pass with identical CPU reference results --- ggml/src/ggml-sycl/backend.hpp | 1 + ggml/src/ggml-sycl/ggml-sycl.cpp | 5 ++ ggml/src/ggml-sycl/roll.cpp | 82 ++++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/roll.hpp | 20 ++++++++ 4 files changed, 108 insertions(+) create mode 100644 ggml/src/ggml-sycl/roll.cpp create mode 100644 ggml/src/ggml-sycl/roll.hpp diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index 6ff3215d5a439..8df5bfa5625bd 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -32,6 +32,7 @@ #include "pad.hpp" #include "quantize.hpp" #include "quants.hpp" +#include "roll.hpp" #include "rope.hpp" #include "set_rows.hpp" #include "softmax.hpp" diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index a7e077ec8ebe0..e80f77edf1502 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3836,6 +3836,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_GATED_LINEAR_ATTN: ggml_sycl_op_gated_linear_attn(ctx, dst); break; + case GGML_OP_ROLL: + ggml_sycl_roll(ctx, dst); + break; case GGML_OP_ARANGE: ggml_sycl_arange(ctx, dst); break; @@ -4491,6 +4494,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_RWKV_WKV7: case GGML_OP_GATED_LINEAR_ATTN: return true; + case GGML_OP_ROLL: + return op->type == GGML_TYPE_F32; case GGML_OP_ARANGE: return op->type == GGML_TYPE_F32; default: diff --git a/ggml/src/ggml-sycl/roll.cpp b/ggml/src/ggml-sycl/roll.cpp new file mode 100644 index 0000000000000..965bb11b7a653 --- /dev/null +++ b/ggml/src/ggml-sycl/roll.cpp @@ -0,0 +1,82 @@ +#include "roll.hpp" +#include "common.hpp" + +using namespace sycl; + +static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor *dst, + int shift0, int shift1, int shift2, int shift3) { + if (!src || !dst) throw std::runtime_error("null tensor"); + if (src->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) + throw std::runtime_error("only F32 supported in SYCL roll"); + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + if (ne0 != src->ne[0] || ne1 != src->ne[1] || ne2 != src->ne[2] || ne3 != src->ne[3]) + throw std::runtime_error("src/dst shape mismatch"); + + // Normalize shifts to be within bounds + const int64_t sh0 = ne0 > 0 ? ((int64_t)shift0 % ne0 + ne0) % ne0 : 0; + const int64_t sh1 = ne1 > 0 ? ((int64_t)shift1 % ne1 + ne1) % ne1 : 0; + const int64_t sh2 = ne2 > 0 ? ((int64_t)shift2 % ne2 + ne2) % ne2 : 0; + const int64_t sh3 = ne3 > 0 ? ((int64_t)shift3 % ne3 + ne3) % ne3 : 0; + + const float *src_d = (const float*) src->data; + float *dst_d = (float*) dst->data; + + if (!src_d || !dst_d) throw std::runtime_error("null data pointers"); + + q.submit([&](handler &h) { + range<3> r((size_t)ne3, (size_t)ne2, (size_t)ne1); + h.parallel_for(r, [=](id<3> idx) { + const int64_t i3 = idx[0]; + const int64_t i2 = idx[1]; + const int64_t i1 = idx[2]; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + const int64_t idx_dst = i0 + i1 * ne0 + i2 * ne0 * ne1 + i3 * ne0 * ne1 * ne2; + + // Apply shift to each dimension + const int64_t src_i0 = (i0 - sh0 + ne0) % ne0; + const int64_t src_i1 = (i1 - sh1 + ne1) % ne1; + const int64_t src_i2 = (i2 - sh2 + ne2) % ne2; + const int64_t src_i3 = (i3 - sh3 + ne3) % ne3; + + const int64_t idx_src = src_i0 + src_i1 * ne0 + + src_i2 * ne0 * ne1 + src_i3 * ne0 * ne1 * ne2; + + dst_d[idx_dst] = src_d[idx_src]; + } + }); + }).wait(); +} + +void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const ggml_tensor *src = dst->src[0]; + + const int32_t *params = (const int32_t *)dst->op_params; + const int shift0 = params[0]; + const int shift1 = params[1]; + const int shift2 = params[2]; + const int shift3 = params[3]; + + // Check if all shifts are zero + if (shift0 == 0 && shift1 == 0 && shift2 == 0 && shift3 == 0) { + const size_t nb = ggml_nbytes(src); + queue *q = ctx.stream(); + SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb).wait())); + return; + } + + try { + queue *q = ctx.stream(); + kernel_roll_multi_axis(*q, src, dst, shift0, shift1, shift2, shift3); + } catch (const std::exception &e) { + std::fprintf(stderr, "[SYCL-ROLL] ERROR: %s\n", e.what()); + throw; + } +} diff --git a/ggml/src/ggml-sycl/roll.hpp b/ggml/src/ggml-sycl/roll.hpp new file mode 100644 index 0000000000000..6356cfe2eb306 --- /dev/null +++ b/ggml/src/ggml-sycl/roll.hpp @@ -0,0 +1,20 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_ROLL_HPP +#define GGML_SYCL_ROLL_HPP + +#include "common.hpp" + +void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst); + +#endif // GGML_SYCL_ROLL_HPP \ No newline at end of file From dea85e1560c1dec38f96dd7f13d1be2230a4fe63 Mon Sep 17 00:00:00 2001 From: tamarPal Date: Tue, 21 Oct 2025 14:49:44 +0300 Subject: [PATCH 2/5] fix: remove trailing whitespace from roll.cpp - Fix EditorConfig violations in ggml/src/ggml-sycl/roll.cpp - Remove trailing spaces from lines 6, 11, 28, 47, 58, 60 --- ggml/src/ggml-sycl/roll.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-sycl/roll.cpp b/ggml/src/ggml-sycl/roll.cpp index 965bb11b7a653..96241f3d501de 100644 --- a/ggml/src/ggml-sycl/roll.cpp +++ b/ggml/src/ggml-sycl/roll.cpp @@ -3,12 +3,12 @@ using namespace sycl; -static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor *dst, +static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor *dst, int shift0, int shift1, int shift2, int shift3) { if (!src || !dst) throw std::runtime_error("null tensor"); if (src->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) throw std::runtime_error("only F32 supported in SYCL roll"); - + const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; const int64_t ne2 = dst->ne[2]; @@ -25,7 +25,7 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor const float *src_d = (const float*) src->data; float *dst_d = (float*) dst->data; - + if (!src_d || !dst_d) throw std::runtime_error("null data pointers"); q.submit([&](handler &h) { @@ -44,7 +44,7 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor const int64_t src_i2 = (i2 - sh2 + ne2) % ne2; const int64_t src_i3 = (i3 - sh3 + ne3) % ne3; - const int64_t idx_src = src_i0 + src_i1 * ne0 + + const int64_t idx_src = src_i0 + src_i1 * ne0 + src_i2 * ne0 * ne1 + src_i3 * ne0 * ne1 * ne2; dst_d[idx_dst] = src_d[idx_src]; @@ -55,9 +55,9 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { GGML_ASSERT(dst->type == GGML_TYPE_F32); - + const ggml_tensor *src = dst->src[0]; - + const int32_t *params = (const int32_t *)dst->op_params; const int shift0 = params[0]; const int shift1 = params[1]; From 386db09d5803698a0239b2973a1118b2a934319d Mon Sep 17 00:00:00 2001 From: tamarPal Date: Tue, 21 Oct 2025 15:32:09 +0300 Subject: [PATCH 3/5] ci: retrigger From 710405b4126d21863801f9d1996b7cfa5893d218 Mon Sep 17 00:00:00 2001 From: tamarPal Date: Wed, 22 Oct 2025 18:30:40 +0300 Subject: [PATCH 4/5] sycl: remove wait() calls from ROLL operation --- ggml/src/ggml-sycl/roll.cpp | 52 +++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-sycl/roll.cpp b/ggml/src/ggml-sycl/roll.cpp index 96241f3d501de..f0133c365475c 100644 --- a/ggml/src/ggml-sycl/roll.cpp +++ b/ggml/src/ggml-sycl/roll.cpp @@ -9,19 +9,25 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor if (src->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) throw std::runtime_error("only F32 supported in SYCL roll"); - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + const int ne3 = dst->ne[3]; if (ne0 != src->ne[0] || ne1 != src->ne[1] || ne2 != src->ne[2] || ne3 != src->ne[3]) throw std::runtime_error("src/dst shape mismatch"); - // Normalize shifts to be within bounds - const int64_t sh0 = ne0 > 0 ? ((int64_t)shift0 % ne0 + ne0) % ne0 : 0; - const int64_t sh1 = ne1 > 0 ? ((int64_t)shift1 % ne1 + ne1) % ne1 : 0; - const int64_t sh2 = ne2 > 0 ? ((int64_t)shift2 % ne2 + ne2) % ne2 : 0; - const int64_t sh3 = ne3 > 0 ? ((int64_t)shift3 % ne3 + ne3) % ne3 : 0; + + const int sh0 = ne0 > 0 ? ((int)shift0 % ne0 + ne0) % ne0 : 0; + const int sh1 = ne1 > 0 ? ((int)shift1 % ne1 + ne1) % ne1 : 0; + const int sh2 = ne2 > 0 ? ((int)shift2 % ne2 + ne2) % ne2 : 0; + const int sh3 = ne3 > 0 ? ((int)shift3 % ne3 + ne3) % ne3 : 0; + + + const int shNe0 = ne0 - sh0; + const int shNe1 = ne1 - sh1; + const int shNe2 = ne2 - sh2; + const int shNe3 = ne3 - sh3; const float *src_d = (const float*) src->data; float *dst_d = (float*) dst->data; @@ -31,26 +37,26 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor q.submit([&](handler &h) { range<3> r((size_t)ne3, (size_t)ne2, (size_t)ne1); h.parallel_for(r, [=](id<3> idx) { - const int64_t i3 = idx[0]; - const int64_t i2 = idx[1]; - const int64_t i1 = idx[2]; + const int i3 = (int)idx[0]; + const int i2 = (int)idx[1]; + const int i1 = (int)idx[2]; - for (int64_t i0 = 0; i0 < ne0; i0++) { - const int64_t idx_dst = i0 + i1 * ne0 + i2 * ne0 * ne1 + i3 * ne0 * ne1 * ne2; + for (int i0 = 0; i0 < ne0; i0++) { + const int idx_dst = i0 + i1 * ne0 + i2 * ne0 * ne1 + i3 * ne0 * ne1 * ne2; - // Apply shift to each dimension - const int64_t src_i0 = (i0 - sh0 + ne0) % ne0; - const int64_t src_i1 = (i1 - sh1 + ne1) % ne1; - const int64_t src_i2 = (i2 - sh2 + ne2) % ne2; - const int64_t src_i3 = (i3 - sh3 + ne3) % ne3; - const int64_t idx_src = src_i0 + src_i1 * ne0 + + const int src_i0 = (i0 + shNe0) % ne0; + const int src_i1 = (i1 + shNe1) % ne1; + const int src_i2 = (i2 + shNe2) % ne2; + const int src_i3 = (i3 + shNe3) % ne3; + + const int idx_src = src_i0 + src_i1 * ne0 + src_i2 * ne0 * ne1 + src_i3 * ne0 * ne1 * ne2; dst_d[idx_dst] = src_d[idx_src]; } }); - }).wait(); + }); } void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { @@ -64,11 +70,11 @@ void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { const int shift2 = params[2]; const int shift3 = params[3]; - // Check if all shifts are zero + if (shift0 == 0 && shift1 == 0 && shift2 == 0 && shift3 == 0) { const size_t nb = ggml_nbytes(src); queue *q = ctx.stream(); - SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb).wait())); + SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb))); return; } From b9793080f99b51fb6c6f19ae4aa6f10550eea91d Mon Sep 17 00:00:00 2001 From: tamarPal Date: Thu, 23 Oct 2025 10:08:36 +0300 Subject: [PATCH 5/5] =?UTF-8?q?fix:=20editorconfig=20=E2=80=94=20LF=20endi?= =?UTF-8?q?ngs=20+=20final=20newline=20for=20roll.hpp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ggml/src/ggml-sycl/roll.cpp | 122 +++++++++++++++++++++++------------- ggml/src/ggml-sycl/roll.hpp | 2 +- 2 files changed, 79 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-sycl/roll.cpp b/ggml/src/ggml-sycl/roll.cpp index f0133c365475c..1e05181789c28 100644 --- a/ggml/src/ggml-sycl/roll.cpp +++ b/ggml/src/ggml-sycl/roll.cpp @@ -3,58 +3,66 @@ using namespace sycl; -static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor *dst, - int shift0, int shift1, int shift2, int shift3) { - if (!src || !dst) throw std::runtime_error("null tensor"); - if (src->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) - throw std::runtime_error("only F32 supported in SYCL roll"); +static inline int wrap_add(int i, int shift, int n) { - const int ne0 = dst->ne[0]; - const int ne1 = dst->ne[1]; - const int ne2 = dst->ne[2]; - const int ne3 = dst->ne[3]; + int s = i + shift; + return (s >= n) ? (s - n) : s; +} + +static void kernel_roll_fused_i0_i1( + queue &q, + const float *src_d, + float *dst_d, + int ne0, int ne1, int ne2, int ne3, + int sh0, int sh1, int sh2, int sh3) +{ + if (ne0 == 0 || ne1 == 0 || ne2 == 0 || ne3 == 0) return; - if (ne0 != src->ne[0] || ne1 != src->ne[1] || ne2 != src->ne[2] || ne3 != src->ne[3]) - throw std::runtime_error("src/dst shape mismatch"); + const int stride1 = ne0; + const int stride2 = ne0 * ne1; + const int stride3 = ne0 * ne1 * ne2; - const int sh0 = ne0 > 0 ? ((int)shift0 % ne0 + ne0) % ne0 : 0; - const int sh1 = ne1 > 0 ? ((int)shift1 % ne1 + ne1) % ne1 : 0; - const int sh2 = ne2 > 0 ? ((int)shift2 % ne2 + ne2) % ne2 : 0; - const int sh3 = ne3 > 0 ? ((int)shift3 % ne3 + ne3) % ne3 : 0; + const int shNe0 = (ne0 - sh0) % ne0; + const int shNe1 = (ne1 - sh1) % ne1; + const int shNe2 = (ne2 - sh2) % ne2; + const int shNe3 = (ne3 - sh3) % ne3; - const int shNe0 = ne0 - sh0; - const int shNe1 = ne1 - sh1; - const int shNe2 = ne2 - sh2; - const int shNe3 = ne3 - sh3; - const float *src_d = (const float*) src->data; - float *dst_d = (float*) dst->data; + const size_t g0 = (size_t) ne3; + const size_t g1 = (size_t) ne2; + const size_t g2 = (size_t) (ne1 * ne0); - if (!src_d || !dst_d) throw std::runtime_error("null data pointers"); + const range<3> global{ g0, g1, g2 }; q.submit([&](handler &h) { - range<3> r((size_t)ne3, (size_t)ne2, (size_t)ne1); - h.parallel_for(r, [=](id<3> idx) { - const int i3 = (int)idx[0]; - const int i2 = (int)idx[1]; - const int i1 = (int)idx[2]; + h.parallel_for(global, [=](id<3> idx) { + const int i3 = (int) idx[0]; + const int i2 = (int) idx[1]; + + const int fused = (int) idx[2]; + const int i1 = fused / ne0; + const int i0 = fused - i1 * ne0; // fused % ne0 + - for (int i0 = 0; i0 < ne0; i0++) { - const int idx_dst = i0 + i1 * ne0 + i2 * ne0 * ne1 + i3 * ne0 * ne1 * ne2; + const int idx_dst = i0 + + i1 * stride1 + + i2 * stride2 + + i3 * stride3; - const int src_i0 = (i0 + shNe0) % ne0; - const int src_i1 = (i1 + shNe1) % ne1; - const int src_i2 = (i2 + shNe2) % ne2; - const int src_i3 = (i3 + shNe3) % ne3; + const int s0 = wrap_add(i0, shNe0, ne0); + const int s1 = wrap_add(i1, shNe1, ne1); + const int s2 = wrap_add(i2, shNe2, ne2); + const int s3 = wrap_add(i3, shNe3, ne3); - const int idx_src = src_i0 + src_i1 * ne0 + - src_i2 * ne0 * ne1 + src_i3 * ne0 * ne1 * ne2; + const int idx_src = s0 + + s1 * stride1 + + s2 * stride2 + + s3 * stride3; - dst_d[idx_dst] = src_d[idx_src]; - } + dst_d[idx_dst] = src_d[idx_src]; }); }); } @@ -63,24 +71,50 @@ void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { GGML_ASSERT(dst->type == GGML_TYPE_F32); const ggml_tensor *src = dst->src[0]; + GGML_ASSERT(src && src->type == GGML_TYPE_F32); - const int32_t *params = (const int32_t *)dst->op_params; - const int shift0 = params[0]; - const int shift1 = params[1]; - const int shift2 = params[2]; - const int shift3 = params[3]; + const int ne0 = (int) dst->ne[0]; + const int ne1 = (int) dst->ne[1]; + const int ne2 = (int) dst->ne[2]; + const int ne3 = (int) dst->ne[3]; + const int32_t *params = (const int32_t *) dst->op_params; + int shift0 = params[0]; + int shift1 = params[1]; + int shift2 = params[2]; + int shift3 = params[3]; - if (shift0 == 0 && shift1 == 0 && shift2 == 0 && shift3 == 0) { + + if ((shift0 | shift1 | shift2 | shift3) == 0) { const size_t nb = ggml_nbytes(src); queue *q = ctx.stream(); SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb))); return; } + auto norm = [](int sh, int n) -> int { + if (n <= 0) return 0; + sh %= n; + if (sh < 0) sh += n; + return sh; + }; + shift0 = norm(shift0, ne0); + shift1 = norm(shift1, ne1); + shift2 = norm(shift2, ne2); + shift3 = norm(shift3, ne3); + try { queue *q = ctx.stream(); - kernel_roll_multi_axis(*q, src, dst, shift0, shift1, shift2, shift3); + + const float *src_d = (const float *) src->data; + float *dst_d = (float *) dst->data; + GGML_ASSERT(src_d && dst_d); + + kernel_roll_fused_i0_i1( + *q, src_d, dst_d, + ne0, ne1, ne2, ne3, + shift0, shift1, shift2, shift3 + ); } catch (const std::exception &e) { std::fprintf(stderr, "[SYCL-ROLL] ERROR: %s\n", e.what()); throw; diff --git a/ggml/src/ggml-sycl/roll.hpp b/ggml/src/ggml-sycl/roll.hpp index 6356cfe2eb306..97dc03d64b24d 100644 --- a/ggml/src/ggml-sycl/roll.hpp +++ b/ggml/src/ggml-sycl/roll.hpp @@ -17,4 +17,4 @@ void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst); -#endif // GGML_SYCL_ROLL_HPP \ No newline at end of file +#endif // GGML_SYCL_ROLL_HPP