From 4b76ed16bca304e9450ce50b97e7b86a449c7464 Mon Sep 17 00:00:00 2001
From: tamarPal <tamarPal@example.com>
Date: Sun, 19 Oct 2025 13:35:56 +0300
Subject: [PATCH 1/5] sycl: add ROLL operation support

- Implement ggml_sycl_roll function for F32 tensors
- Add multi-axis roll operation with SYCL kernel
- Support all 4 tensor dimensions with proper shift normalization
- Add roll.cpp and roll.hpp to SYCL backend
- Update backend dispatch and supports_op for GGML_OP_ROLL
- Tests: 17662/17662 pass with identical CPU reference results
---
 ggml/src/ggml-sycl/backend.hpp   |  1 +
 ggml/src/ggml-sycl/ggml-sycl.cpp |  5 ++
 ggml/src/ggml-sycl/roll.cpp      | 82 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-sycl/roll.hpp      | 20 ++++++++
 4 files changed, 108 insertions(+)
 create mode 100644 ggml/src/ggml-sycl/roll.cpp
 create mode 100644 ggml/src/ggml-sycl/roll.hpp

diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
index 6ff3215d5a439..8df5bfa5625bd 100644
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -32,6 +32,7 @@
 #include "pad.hpp"
 #include "quantize.hpp"
 #include "quants.hpp"
+#include "roll.hpp"
 #include "rope.hpp"
 #include "set_rows.hpp"
 #include "softmax.hpp"
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index a7e077ec8ebe0..e80f77edf1502 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3836,6 +3836,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_OP_GATED_LINEAR_ATTN:
             ggml_sycl_op_gated_linear_attn(ctx, dst);
             break;
+        case GGML_OP_ROLL:
+            ggml_sycl_roll(ctx, dst);
+            break;
         case GGML_OP_ARANGE:
             ggml_sycl_arange(ctx, dst);
             break;
@@ -4491,6 +4494,8 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_RWKV_WKV7:
         case GGML_OP_GATED_LINEAR_ATTN:
             return true;
+        case GGML_OP_ROLL:
+            return op->type == GGML_TYPE_F32;
         case GGML_OP_ARANGE:
             return op->type == GGML_TYPE_F32;
         default:
diff --git a/ggml/src/ggml-sycl/roll.cpp b/ggml/src/ggml-sycl/roll.cpp
new file mode 100644
index 0000000000000..965bb11b7a653
--- /dev/null
+++ b/ggml/src/ggml-sycl/roll.cpp
@@ -0,0 +1,82 @@
+#include "roll.hpp"
+#include "common.hpp"
+
+using namespace sycl;
+
+static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor *dst, 
+                                    int shift0, int shift1, int shift2, int shift3) {
+    if (!src || !dst) throw std::runtime_error("null tensor");
+    if (src->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32)
+        throw std::runtime_error("only F32 supported in SYCL roll");
+    
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    if (ne0 != src->ne[0] || ne1 != src->ne[1] || ne2 != src->ne[2] || ne3 != src->ne[3])
+        throw std::runtime_error("src/dst shape mismatch");
+
+    // Normalize shifts to be within bounds
+    const int64_t sh0 = ne0 > 0 ? ((int64_t)shift0 % ne0 + ne0) % ne0 : 0;
+    const int64_t sh1 = ne1 > 0 ? ((int64_t)shift1 % ne1 + ne1) % ne1 : 0;
+    const int64_t sh2 = ne2 > 0 ? ((int64_t)shift2 % ne2 + ne2) % ne2 : 0;
+    const int64_t sh3 = ne3 > 0 ? ((int64_t)shift3 % ne3 + ne3) % ne3 : 0;
+
+    const float *src_d = (const float*) src->data;
+    float *dst_d = (float*) dst->data;
+    
+    if (!src_d || !dst_d) throw std::runtime_error("null data pointers");
+
+    q.submit([&](handler &h) {
+        range<3> r((size_t)ne3, (size_t)ne2, (size_t)ne1);
+        h.parallel_for(r, [=](id<3> idx) {
+            const int64_t i3 = idx[0];
+            const int64_t i2 = idx[1];
+            const int64_t i1 = idx[2];
+
+            for (int64_t i0 = 0; i0 < ne0; i0++) {
+                const int64_t idx_dst = i0 + i1 * ne0 + i2 * ne0 * ne1 + i3 * ne0 * ne1 * ne2;
+
+                // Apply shift to each dimension
+                const int64_t src_i0 = (i0 - sh0 + ne0) % ne0;
+                const int64_t src_i1 = (i1 - sh1 + ne1) % ne1;
+                const int64_t src_i2 = (i2 - sh2 + ne2) % ne2;
+                const int64_t src_i3 = (i3 - sh3 + ne3) % ne3;
+
+                const int64_t idx_src = src_i0 + src_i1 * ne0 + 
+                                        src_i2 * ne0 * ne1 + src_i3 * ne0 * ne1 * ne2;
+
+                dst_d[idx_dst] = src_d[idx_src];
+            }
+        });
+    }).wait();
+}
+
+void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    
+    const ggml_tensor *src = dst->src[0];
+    
+    const int32_t *params = (const int32_t *)dst->op_params;
+    const int shift0 = params[0];
+    const int shift1 = params[1];
+    const int shift2 = params[2];
+    const int shift3 = params[3];
+
+    // Check if all shifts are zero
+    if (shift0 == 0 && shift1 == 0 && shift2 == 0 && shift3 == 0) {
+        const size_t nb = ggml_nbytes(src);
+        queue *q = ctx.stream();
+        SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb).wait()));
+        return;
+    }
+
+    try {
+        queue *q = ctx.stream();
+        kernel_roll_multi_axis(*q, src, dst, shift0, shift1, shift2, shift3);
+    } catch (const std::exception &e) {
+        std::fprintf(stderr, "[SYCL-ROLL] ERROR: %s\n", e.what());
+        throw;
+    }
+}
diff --git a/ggml/src/ggml-sycl/roll.hpp b/ggml/src/ggml-sycl/roll.hpp
new file mode 100644
index 0000000000000..6356cfe2eb306
--- /dev/null
+++ b/ggml/src/ggml-sycl/roll.hpp
@@ -0,0 +1,20 @@
+//
+// MIT license
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_ROLL_HPP
+#define GGML_SYCL_ROLL_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
+
+#endif // GGML_SYCL_ROLL_HPP
\ No newline at end of file

From dea85e1560c1dec38f96dd7f13d1be2230a4fe63 Mon Sep 17 00:00:00 2001
From: tamarPal <tamarPal@example.com>
Date: Tue, 21 Oct 2025 14:49:44 +0300
Subject: [PATCH 2/5] fix: remove trailing whitespace from roll.cpp

- Fix EditorConfig violations in ggml/src/ggml-sycl/roll.cpp
- Remove trailing spaces from lines 6, 11, 28, 47, 58, 60
---
 ggml/src/ggml-sycl/roll.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-sycl/roll.cpp b/ggml/src/ggml-sycl/roll.cpp
index 965bb11b7a653..96241f3d501de 100644
--- a/ggml/src/ggml-sycl/roll.cpp
+++ b/ggml/src/ggml-sycl/roll.cpp
@@ -3,12 +3,12 @@
 
 using namespace sycl;
 
-static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor *dst, 
+static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor *dst,
                                     int shift0, int shift1, int shift2, int shift3) {
     if (!src || !dst) throw std::runtime_error("null tensor");
     if (src->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32)
         throw std::runtime_error("only F32 supported in SYCL roll");
-    
+
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
     const int64_t ne2 = dst->ne[2];
@@ -25,7 +25,7 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor
 
     const float *src_d = (const float*) src->data;
     float *dst_d = (float*) dst->data;
-    
+
     if (!src_d || !dst_d) throw std::runtime_error("null data pointers");
 
     q.submit([&](handler &h) {
@@ -44,7 +44,7 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor
                 const int64_t src_i2 = (i2 - sh2 + ne2) % ne2;
                 const int64_t src_i3 = (i3 - sh3 + ne3) % ne3;
 
-                const int64_t idx_src = src_i0 + src_i1 * ne0 + 
+                const int64_t idx_src = src_i0 + src_i1 * ne0 +
                                         src_i2 * ne0 * ne1 + src_i3 * ne0 * ne1 * ne2;
 
                 dst_d[idx_dst] = src_d[idx_src];
@@ -55,9 +55,9 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor
 
 void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    
+
     const ggml_tensor *src = dst->src[0];
-    
+
     const int32_t *params = (const int32_t *)dst->op_params;
     const int shift0 = params[0];
     const int shift1 = params[1];

From 386db09d5803698a0239b2973a1118b2a934319d Mon Sep 17 00:00:00 2001
From: tamarPal <tamarPal@example.com>
Date: Tue, 21 Oct 2025 15:32:09 +0300
Subject: [PATCH 3/5] ci: retrigger


From 710405b4126d21863801f9d1996b7cfa5893d218 Mon Sep 17 00:00:00 2001
From: tamarPal <tamarPal@example.com>
Date: Wed, 22 Oct 2025 18:30:40 +0300
Subject: [PATCH 4/5] sycl: remove wait() calls from ROLL operation, use int index math

---
 ggml/src/ggml-sycl/roll.cpp | 52 +++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/ggml/src/ggml-sycl/roll.cpp b/ggml/src/ggml-sycl/roll.cpp
index 96241f3d501de..f0133c365475c 100644
--- a/ggml/src/ggml-sycl/roll.cpp
+++ b/ggml/src/ggml-sycl/roll.cpp
@@ -9,19 +9,25 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor
     if (src->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32)
         throw std::runtime_error("only F32 supported in SYCL roll");
 
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+    const int ne2 = dst->ne[2];
+    const int ne3 = dst->ne[3];
 
     if (ne0 != src->ne[0] || ne1 != src->ne[1] || ne2 != src->ne[2] || ne3 != src->ne[3])
         throw std::runtime_error("src/dst shape mismatch");
 
-    // Normalize shifts to be within bounds
-    const int64_t sh0 = ne0 > 0 ? ((int64_t)shift0 % ne0 + ne0) % ne0 : 0;
-    const int64_t sh1 = ne1 > 0 ? ((int64_t)shift1 % ne1 + ne1) % ne1 : 0;
-    const int64_t sh2 = ne2 > 0 ? ((int64_t)shift2 % ne2 + ne2) % ne2 : 0;
-    const int64_t sh3 = ne3 > 0 ? ((int64_t)shift3 % ne3 + ne3) % ne3 : 0;
+
+    const int sh0 = ne0 > 0 ? ((int)shift0 % ne0 + ne0) % ne0 : 0;
+    const int sh1 = ne1 > 0 ? ((int)shift1 % ne1 + ne1) % ne1 : 0;
+    const int sh2 = ne2 > 0 ? ((int)shift2 % ne2 + ne2) % ne2 : 0;
+    const int sh3 = ne3 > 0 ? ((int)shift3 % ne3 + ne3) % ne3 : 0;
+
+
+    const int shNe0 = ne0 - sh0;
+    const int shNe1 = ne1 - sh1;
+    const int shNe2 = ne2 - sh2;
+    const int shNe3 = ne3 - sh3;
 
     const float *src_d = (const float*) src->data;
     float *dst_d = (float*) dst->data;
@@ -31,26 +37,26 @@ static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor
     q.submit([&](handler &h) {
         range<3> r((size_t)ne3, (size_t)ne2, (size_t)ne1);
         h.parallel_for(r, [=](id<3> idx) {
-            const int64_t i3 = idx[0];
-            const int64_t i2 = idx[1];
-            const int64_t i1 = idx[2];
+            const int i3 = (int)idx[0];
+            const int i2 = (int)idx[1];
+            const int i1 = (int)idx[2];
 
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                const int64_t idx_dst = i0 + i1 * ne0 + i2 * ne0 * ne1 + i3 * ne0 * ne1 * ne2;
+            for (int i0 = 0; i0 < ne0; i0++) {
+                const int idx_dst = i0 + i1 * ne0 + i2 * ne0 * ne1 + i3 * ne0 * ne1 * ne2;
 
-                // Apply shift to each dimension
-                const int64_t src_i0 = (i0 - sh0 + ne0) % ne0;
-                const int64_t src_i1 = (i1 - sh1 + ne1) % ne1;
-                const int64_t src_i2 = (i2 - sh2 + ne2) % ne2;
-                const int64_t src_i3 = (i3 - sh3 + ne3) % ne3;
 
-                const int64_t idx_src = src_i0 + src_i1 * ne0 +
+                const int src_i0 = (i0 + shNe0) % ne0;
+                const int src_i1 = (i1 + shNe1) % ne1;
+                const int src_i2 = (i2 + shNe2) % ne2;
+                const int src_i3 = (i3 + shNe3) % ne3;
+
+                const int idx_src = src_i0 + src_i1 * ne0 +
                                         src_i2 * ne0 * ne1 + src_i3 * ne0 * ne1 * ne2;
 
                 dst_d[idx_dst] = src_d[idx_src];
             }
         });
-    }).wait();
+    });
 }
 
 void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
@@ -64,11 +70,11 @@ void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
     const int shift2 = params[2];
     const int shift3 = params[3];
 
-    // Check if all shifts are zero
+
     if (shift0 == 0 && shift1 == 0 && shift2 == 0 && shift3 == 0) {
         const size_t nb = ggml_nbytes(src);
         queue *q = ctx.stream();
-        SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb).wait()));
+        SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb)));
         return;
     }
 

From b9793080f99b51fb6c6f19ae4aa6f10550eea91d Mon Sep 17 00:00:00 2001
From: tamarPal <tamarPal@example.com>
Date: Thu, 23 Oct 2025 10:08:36 +0300
Subject: [PATCH 5/5] sycl: fuse i0/i1 in ROLL kernel; editorconfig fixes
 (LF endings, final newline in roll.hpp)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ggml/src/ggml-sycl/roll.cpp | 122 +++++++++++++++++++++++-------------
 ggml/src/ggml-sycl/roll.hpp |   2 +-
 2 files changed, 79 insertions(+), 45 deletions(-)

diff --git a/ggml/src/ggml-sycl/roll.cpp b/ggml/src/ggml-sycl/roll.cpp
index f0133c365475c..1e05181789c28 100644
--- a/ggml/src/ggml-sycl/roll.cpp
+++ b/ggml/src/ggml-sycl/roll.cpp
@@ -3,58 +3,66 @@
 
 using namespace sycl;
 
-static void kernel_roll_multi_axis(queue &q, const ggml_tensor *src, ggml_tensor *dst,
-                                    int shift0, int shift1, int shift2, int shift3) {
-    if (!src || !dst) throw std::runtime_error("null tensor");
-    if (src->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32)
-        throw std::runtime_error("only F32 supported in SYCL roll");
+static inline int wrap_add(int i, int shift, int n) {  // returns i + shift, wrapped around at n at most once
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    const int ne3 = dst->ne[3];
+    int s = i + shift;  // plain sum; no '%' is used
+    return (s >= n) ? (s - n) : s;  // single conditional subtract: result is in [0, n) only if 0 <= i + shift < 2 * n
+}
+
+static void kernel_roll_fused_i0_i1(      // writes dst = src cyclically shifted along each of the 4 axes
+    queue &q,                             // queue the kernel is submitted to; this function does not wait on it
+    const float *src_d,                   // input, indexed densely as i0 + ne0 * (i1 + ne1 * (i2 + ne2 * i3))
+    float *dst_d,                         // output, indexed with the same dense layout
+    int ne0, int ne1, int ne2, int ne3,   // extent of each axis
+    int sh0, int sh1, int sh2, int sh3)   // per-axis shift, expected pre-normalized to [0, neK)
+{
+    if (ne0 == 0 || ne1 == 0 || ne2 == 0 || ne3 == 0) return;  // empty tensor: nothing to submit
 
-    if (ne0 != src->ne[0] || ne1 != src->ne[1] || ne2 != src->ne[2] || ne3 != src->ne[3])
-        throw std::runtime_error("src/dst shape mismatch");
 
+    const int64_t stride1 = ne0;            // 64-bit strides: ne0 * ne1 * ne2 can overflow int
+    const int64_t stride2 = stride1 * ne1;
+    const int64_t stride3 = stride2 * ne2;
 
-    const int sh0 = ne0 > 0 ? ((int)shift0 % ne0 + ne0) % ne0 : 0;
-    const int sh1 = ne1 > 0 ? ((int)shift1 % ne1 + ne1) % ne1 : 0;
-    const int sh2 = ne2 > 0 ? ((int)shift2 % ne2 + ne2) % ne2 : 0;
-    const int sh3 = ne3 > 0 ? ((int)shift3 % ne3 + ne3) % ne3 : 0;
 
+    const int shNe0 = (ne0 - sh0) % ne0;    // source index is i - sh (mod ne), expressed as i + (ne - sh)
+    const int shNe1 = (ne1 - sh1) % ne1;
+    const int shNe2 = (ne2 - sh2) % ne2;
+    const int shNe3 = (ne3 - sh3) % ne3;
 
-    const int shNe0 = ne0 - sh0;
-    const int shNe1 = ne1 - sh1;
-    const int shNe2 = ne2 - sh2;
-    const int shNe3 = ne3 - sh3;
 
-    const float *src_d = (const float*) src->data;
-    float *dst_d = (float*) dst->data;
+    const size_t g0 = (size_t) ne3;
+    const size_t g1 = (size_t) ne2;
+    const size_t g2 = (size_t) ne1 * (size_t) ne0;  // widen before multiplying: ne1 * ne0 may not fit in int
 
-    if (!src_d || !dst_d) throw std::runtime_error("null data pointers");
+    const range<3> global{ g0, g1, g2 };
 
     q.submit([&](handler &h) {
-        range<3> r((size_t)ne3, (size_t)ne2, (size_t)ne1);
-        h.parallel_for(r, [=](id<3> idx) {
-            const int i3 = (int)idx[0];
-            const int i2 = (int)idx[1];
-            const int i1 = (int)idx[2];
+        h.parallel_for(global, [=](id<3> idx) {
+            const int i3 = (int) idx[0];
+            const int i2 = (int) idx[1];
+            // idx[2] enumerates the fused (i1, i0) plane and can exceed INT_MAX, so decode it in 64 bits.
+            const int64_t fused = (int64_t) idx[2];
+            const int i1 = (int) (fused / ne0);
+            const int i0 = (int) (fused - (int64_t) i1 * ne0);  // fused % ne0
+            // Offsets below are 64-bit so tensors with more than INT_MAX elements are addressed correctly.
 
-            for (int i0 = 0; i0 < ne0; i0++) {
-                const int idx_dst = i0 + i1 * ne0 + i2 * ne0 * ne1 + i3 * ne0 * ne1 * ne2;
+            const int64_t idx_dst = i0
+                                  + i1 * stride1
+                                  + i2 * stride2
+                                  + i3 * stride3;
 
 
-                const int src_i0 = (i0 + shNe0) % ne0;
-                const int src_i1 = (i1 + shNe1) % ne1;
-                const int src_i2 = (i2 + shNe2) % ne2;
-                const int src_i3 = (i3 + shNe3) % ne3;
+            const int s0 = wrap_add(i0, shNe0, ne0);
+            const int s1 = wrap_add(i1, shNe1, ne1);
+            const int s2 = wrap_add(i2, shNe2, ne2);
+            const int s3 = wrap_add(i3, shNe3, ne3);
 
-                const int idx_src = src_i0 + src_i1 * ne0 +
-                                        src_i2 * ne0 * ne1 + src_i3 * ne0 * ne1 * ne2;
+            const int64_t idx_src = s0
+                                  + s1 * stride1
+                                  + s2 * stride2
+                                  + s3 * stride3;
 
-                dst_d[idx_dst] = src_d[idx_src];
-            }
+            dst_d[idx_dst] = src_d[idx_src];
         });
     });
 }
@@ -63,24 +71,50 @@ void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     const ggml_tensor *src = dst->src[0];
+    GGML_ASSERT(src && src->type == GGML_TYPE_F32);
 
-    const int32_t *params = (const int32_t *)dst->op_params;
-    const int shift0 = params[0];
-    const int shift1 = params[1];
-    const int shift2 = params[2];
-    const int shift3 = params[3];
+    const int ne0 = (int) dst->ne[0];
+    const int ne1 = (int) dst->ne[1];
+    const int ne2 = (int) dst->ne[2];
+    const int ne3 = (int) dst->ne[3];
 
+    const int32_t *params = (const int32_t *) dst->op_params;
+    int shift0 = params[0];
+    int shift1 = params[1];
+    int shift2 = params[2];
+    int shift3 = params[3];
 
-    if (shift0 == 0 && shift1 == 0 && shift2 == 0 && shift3 == 0) {
+
+    if ((shift0 | shift1 | shift2 | shift3) == 0) {
         const size_t nb = ggml_nbytes(src);
         queue *q = ctx.stream();
         SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb)));
         return;
     }
 
+    auto norm = [](int sh, int n) -> int {
+        if (n <= 0) return 0;
+        sh %= n;
+        if (sh < 0) sh += n;
+        return sh;
+    };
+    shift0 = norm(shift0, ne0);
+    shift1 = norm(shift1, ne1);
+    shift2 = norm(shift2, ne2);
+    shift3 = norm(shift3, ne3);
+
     try {
         queue *q = ctx.stream();
-        kernel_roll_multi_axis(*q, src, dst, shift0, shift1, shift2, shift3);
+
+        const float *src_d = (const float *) src->data;
+        float *dst_d = (float *) dst->data;
+        GGML_ASSERT(src_d && dst_d);
+
+        kernel_roll_fused_i0_i1(
+            *q, src_d, dst_d,
+            ne0, ne1, ne2, ne3,
+            shift0, shift1, shift2, shift3
+        );
     } catch (const std::exception &e) {
         std::fprintf(stderr, "[SYCL-ROLL] ERROR: %s\n", e.what());
         throw;
diff --git a/ggml/src/ggml-sycl/roll.hpp b/ggml/src/ggml-sycl/roll.hpp
index 6356cfe2eb306..97dc03d64b24d 100644
--- a/ggml/src/ggml-sycl/roll.hpp
+++ b/ggml/src/ggml-sycl/roll.hpp
@@ -17,4 +17,4 @@
 
 void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
 
-#endif // GGML_SYCL_ROLL_HPP
\ No newline at end of file
+#endif // GGML_SYCL_ROLL_HPP