From 6df9afda47a40903d518e7a9139ec5709b32ee92 Mon Sep 17 00:00:00 2001
From: safranowith <bsh155762@gmail.com>
Date: Mon, 15 Sep 2025 16:14:41 +0300
Subject: [PATCH 1/3] ops: add ROUND operator support for CPU and SYCL

---
 docs/ops.md                         |  1 +
 docs/ops/CPU.csv                    |  4 +++
 docs/ops/SYCL.csv                   |  4 +++
 ggml/include/ggml.h                 |  8 ++++++
 ggml/src/ggml-cpu/ops.cpp           |  4 +++
 ggml/src/ggml-cpu/unary-ops.cpp     |  8 ++++++
 ggml/src/ggml-cpu/unary-ops.h       |  1 +
 ggml/src/ggml-sycl/element_wise.cpp | 30 ++++++++++++++++++++++
 ggml/src/ggml-sycl/element_wise.hpp |  2 ++
 ggml/src/ggml-sycl/ggml-sycl.cpp    |  4 +++
 ggml/src/ggml.c                     | 17 +++++++++++-
 tests/test-backend-ops.cpp          | 40 +++++++++++++++++++++++++++++
 12 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/docs/ops.md b/docs/ops.md
index 9a81ca0a9770c..6cb903a8ce7e7 100644
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -76,6 +76,7 @@ Legend:
 |                             ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
 |                             ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
 |                        ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
+|                            ROUND | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
 |                        RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                        RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 |                            SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
diff --git a/docs/ops/CPU.csv b/docs/ops/CPU.csv
index 21e0d1b3c9117..49e2b04f69519 100644
--- a/docs/ops/CPU.csv
+++ b/docs/ops/CPU.csv
@@ -1,6 +1,8 @@
 "backend_name","op_name","op_params","test_mode","supported","error_message","backend_reg_name"
 "CPU","ABS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
 "CPU","ABS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
 "CPU","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
@@ -61,6 +63,8 @@
 "CPU","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","CPU"
 "CPU","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
 "CPU","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
+"CPU","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
 "CPU","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CPU"
 "CPU","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CPU"
diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv
index 5d022ee91aa7a..226bcd28b6eb1 100644
--- a/docs/ops/SYCL.csv
+++ b/docs/ops/SYCL.csv
@@ -1,6 +1,8 @@
 "backend_name","op_name","op_params","test_mode","supported","error_message","backend_reg_name"
 "SYCL0","ABS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","ABS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
+"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","SGN","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","SGN","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","NEG","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
@@ -61,6 +63,8 @@
 "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL"
 "SYCL0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
+"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
+"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
 "SYCL0","SGN","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL"
 "SYCL0","NEG","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL"
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 7e9c3c8c7a096..94b19de8889e9 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -559,6 +559,7 @@ extern "C" {
 
     enum ggml_unary_op {
         GGML_UNARY_OP_ABS,
+        GGML_UNARY_OP_ROUND,
         GGML_UNARY_OP_SGN,
         GGML_UNARY_OP_NEG,
         GGML_UNARY_OP_STEP,
@@ -1027,6 +1028,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_abs_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
+    GGML_API struct ggml_tensor * ggml_round(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_round_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
 
     GGML_API struct ggml_tensor * ggml_sgn(
             struct ggml_context * ctx,
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 8c1f7948855ac..c7a564e375be7 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -9336,6 +9336,10 @@ void ggml_compute_forward_unary(
             {
                 ggml_compute_forward_abs(params, dst);
             } break;
+        case GGML_UNARY_OP_ROUND:
+            {
+                ggml_compute_forward_round(params, dst);
+            } break;
         case GGML_UNARY_OP_SGN:
             {
                 ggml_compute_forward_sgn(params, dst);
diff --git a/ggml/src/ggml-cpu/unary-ops.cpp b/ggml/src/ggml-cpu/unary-ops.cpp
index 4fce569b3bfc8..0c82e82b3a7c7 100644
--- a/ggml/src/ggml-cpu/unary-ops.cpp
+++ b/ggml/src/ggml-cpu/unary-ops.cpp
@@ -4,6 +4,10 @@ static inline float op_abs(float x) {
     return fabsf(x);
 }
 
+static inline float op_round(float x) {
+    return roundf(x);
+}
+
 static inline float op_sgn(float x) {
     return (x > 0.f) ? 1.f : ((x < 0.f) ? -1.f : 0.f);
 }
@@ -125,6 +129,10 @@ void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor *
     unary_op<op_abs>(params, dst);
 }
 
+void ggml_compute_forward_round(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_round>(params, dst);
+}
+
 void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
     unary_op<op_sgn>(params, dst);
 }
diff --git a/ggml/src/ggml-cpu/unary-ops.h b/ggml/src/ggml-cpu/unary-ops.h
index b1ade2c8e341f..c65f97e4b2c52 100644
--- a/ggml/src/ggml-cpu/unary-ops.h
+++ b/ggml/src/ggml-cpu/unary-ops.h
@@ -7,6 +7,7 @@ extern "C" {
 #endif
 
 void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_round(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 0363b06a3ec9b..74e26b02f90e7 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -39,6 +39,11 @@ static __dpct_inline__ T op_abs(T x) {
     return sycl::fabs(x);
 }
 
+template<typename T>
+static __dpct_inline__ T op_round(T x) {
+    return sycl::round(x);
+}
+
 template<typename T>
 static __dpct_inline__ T op_elu(T x) {
     return (x > static_cast<T>(0.f)) ? x : sycl::expm1(x);
@@ -164,6 +169,13 @@ static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::n
     }
 }
 
+template<typename T>
+static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_round(x[i]);
+    }
+}
+
 template<typename T>
 static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
@@ -661,6 +673,19 @@ static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor
         });
 }
 
+static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+            const int num_blocks = ceil_div(k_elements, 256);
+            sycl_parallel_for(stream,
+                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+                                  sycl::range<1>(256)),
+                [=](sycl::nd_item<1> item_ct1) {
+                    unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1);
+                });
+        });
+}
+
 static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
@@ -1139,6 +1164,11 @@ void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_op_abs(ctx, dst);
 }
 
+void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_round(ctx, dst);
+}
+
 void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
     ggml_sycl_op_elu(ctx, dst);
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
index 50749e87d783e..b10122af965f0 100644
--- a/ggml/src/ggml-sycl/element_wise.hpp
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -75,6 +75,8 @@ void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
+void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
 void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index 18ff4e0b0c7cf..9e32ca7fcd3cc 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3626,6 +3626,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
                 case GGML_UNARY_OP_ABS:
                     ggml_sycl_abs(ctx, dst);
                     break;
+                case GGML_UNARY_OP_ROUND:
+                    ggml_sycl_round(ctx, dst);
+                    break;
                 case GGML_UNARY_OP_ELU:
                     ggml_sycl_elu(ctx, dst);
                     break;
@@ -4181,6 +4184,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 case GGML_UNARY_OP_EXP:
                 case GGML_UNARY_OP_SGN:
                 case GGML_UNARY_OP_ABS:
+                case GGML_UNARY_OP_ROUND:
                 case GGML_UNARY_OP_ELU:
 #if defined (GGML_SYCL_F16)
                     return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index d76ea58f789e2..6659a30c833b4 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1127,6 +1127,7 @@ static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
 static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "ABS",
+    "ROUND",
     "SGN",
     "NEG",
     "STEP",
@@ -1143,7 +1144,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
     "GELU_ERF",
 };
 
-static_assert(GGML_UNARY_OP_COUNT == 15, "GGML_UNARY_OP_COUNT != 15");
+static_assert(GGML_UNARY_OP_COUNT == 16, "GGML_UNARY_OP_COUNT != 16");
 
 
 static const char * GGML_GLU_OP_NAME[GGML_GLU_OP_COUNT] = {
@@ -2479,6 +2480,20 @@ struct ggml_tensor * ggml_abs_inplace(
     return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ABS);
 }
 
+// ggml_round
+
+struct ggml_tensor * ggml_round(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary(ctx, a, GGML_UNARY_OP_ROUND);
+}
+
+struct ggml_tensor * ggml_round_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a) {
+    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_ROUND);
+}
+
 // ggml_sgn
 
 struct ggml_tensor * ggml_sgn(
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 3a58621094d17..1c326965eff7f 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3555,6 +3555,45 @@ struct test_sin : public test_case {
     }
 };
 
+// GGML_OP_ROUND
+struct test_round : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_round(ggml_type type = GGML_TYPE_F32,
+               std::array<int64_t, 4> ne = {10, 5, 4, 3})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_round(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -10.0f, 10.0f);
+        }
+    }
+
+    float grad_eps() override {
+        return 1.0f;
+    }
+
+    bool grad_precise() override {
+        return false;
+    }
+};
+
 // GGML_OP_COS
 struct test_cos : public test_case {
     const ggml_type type;
@@ -6176,6 +6215,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_sqr(type));
         test_cases.emplace_back(new test_sqrt(type));
         test_cases.emplace_back(new test_log(type));
+        test_cases.emplace_back(new test_round(type));
         test_cases.emplace_back(new test_sin(type));
         test_cases.emplace_back(new test_cos(type));
         test_cases.emplace_back(new test_clamp(type));

From 7d52483ba91c46764321e5a479947882f66f6ebd Mon Sep 17 00:00:00 2001
From: safranowith <bsh155762@gmail.com>
Date: Tue, 16 Sep 2025 14:18:37 +0300
Subject: [PATCH 2/3] fix(sycl): resolve build error by replacing
 sycl_parallel_for with stream->parallel_for

---
 ggml/src/ggml-sycl/element_wise.cpp | 60 ++++++++++++++---------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 74e26b02f90e7..89877bd353550 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -419,7 +419,7 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
                          const int ne12, const int nb1, const int nb2,
                          const int offset, queue_ptr stream) {
     int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE);
-    sycl_parallel_for(stream,
+    stream->paraller_for(
         sycl::nd_range<1>(sycl::range<1>(num_blocks) *
                               sycl::range<1>(SYCL_ACC_BLOCK_SIZE),
                           sycl::range<1>(SYCL_ACC_BLOCK_SIZE)),
@@ -449,7 +449,7 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
                          const int ne1, const int ne2, queue_ptr stream) {
     int num_blocks = ceil_div(ne0, SYCL_PAD_BLOCK_SIZE);
     sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    sycl_parallel_for(stream,
+    stream->paraller_for(
                       sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
                                         sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
                       [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
@@ -651,7 +651,7 @@ static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -664,7 +664,7 @@ static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -677,7 +677,7 @@ static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tens
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -690,7 +690,7 @@ static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -703,7 +703,7 @@ static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -716,7 +716,7 @@ static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -729,7 +729,7 @@ static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -742,7 +742,7 @@ static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_t
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -755,7 +755,7 @@ static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -768,7 +768,7 @@ static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -781,7 +781,7 @@ static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggm
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -794,7 +794,7 @@ static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -807,7 +807,7 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -820,7 +820,7 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -833,7 +833,7 @@ static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -846,7 +846,7 @@ static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -859,7 +859,7 @@ static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_te
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -872,7 +872,7 @@ static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -885,7 +885,7 @@ static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -898,7 +898,7 @@ static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -913,7 +913,7 @@ static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) {
             const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -926,7 +926,7 @@ static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -960,7 +960,7 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) {
             const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE);
-            sycl_parallel_for(stream,
+            stream->paraller_for(
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -992,7 +992,7 @@ static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tens
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(main_stream,
+            main_stream->paraller_for(
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });
@@ -1003,7 +1003,7 @@ static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tens
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
-            sycl_parallel_for(main_stream,
+            main_stream->paraller_for(
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });
@@ -1014,7 +1014,7 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
-            sycl_parallel_for(main_stream,
+            main_stream->paraller_for(
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });
@@ -1025,7 +1025,7 @@ static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(main_stream,
+            main_stream->paraller_for(
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });
@@ -1036,7 +1036,7 @@ static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggm
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            sycl_parallel_for(main_stream,
+            main_stream->paraller_for(
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });

From eeb54fe035ac287f6c38ad39601791176fb3536e Mon Sep 17 00:00:00 2001
From: safranowith <bsh155762@gmail.com>
Date: Tue, 16 Sep 2025 14:28:24 +0300
Subject: [PATCH 3/3] Conflict resolution

---
 ggml/src/ggml-sycl/element_wise.cpp | 64 ++++++++++++++---------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp
index 89877bd353550..25b27d053e973 100644
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
@@ -419,7 +419,7 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
                          const int ne12, const int nb1, const int nb2,
                          const int offset, queue_ptr stream) {
     int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE);
-    stream->paraller_for(
+    sycl_parallel_for(stream,
         sycl::nd_range<1>(sycl::range<1>(num_blocks) *
                               sycl::range<1>(SYCL_ACC_BLOCK_SIZE),
                           sycl::range<1>(SYCL_ACC_BLOCK_SIZE)),
@@ -449,7 +449,7 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
                          const int ne1, const int ne2, queue_ptr stream) {
     int num_blocks = ceil_div(ne0, SYCL_PAD_BLOCK_SIZE);
     sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->paraller_for(
+    sycl_parallel_for(stream,
                       sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
                                         sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
                       [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
@@ -651,7 +651,7 @@ static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -664,7 +664,7 @@ static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -677,7 +677,7 @@ static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tens
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -690,7 +690,7 @@ static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -703,7 +703,7 @@ static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -716,7 +716,7 @@ static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -729,7 +729,7 @@ static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -742,7 +742,7 @@ static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_t
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -755,7 +755,7 @@ static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -768,7 +768,7 @@ static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -781,7 +781,7 @@ static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggm
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -794,7 +794,7 @@ static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -807,7 +807,7 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -820,7 +820,7 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -833,7 +833,7 @@ static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -846,7 +846,7 @@ static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -859,7 +859,7 @@ static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_te
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -872,7 +872,7 @@ static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tenso
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -885,7 +885,7 @@ static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -898,7 +898,7 @@ static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -913,7 +913,7 @@ static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) {
             const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -926,7 +926,7 @@ static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -960,7 +960,7 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) {
             const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE);
-            stream->paraller_for(
+            sycl_parallel_for(stream,
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
@@ -992,9 +992,9 @@ static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tens
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            main_stream->paraller_for(
-                    sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
-                gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
+                sycl_parallel_for(main_stream, 
+                       sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
+                    gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });
         });
 }
@@ -1003,7 +1003,7 @@ static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tens
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
-            main_stream->paraller_for(
+            sycl_parallel_for(main_stream,
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });
@@ -1014,7 +1014,7 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu
-            main_stream->paraller_for(
+            sycl_parallel_for(main_stream,
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });
@@ -1025,7 +1025,7 @@ static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            main_stream->paraller_for(
+            sycl_parallel_for(main_stream,
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });
@@ -1036,7 +1036,7 @@ static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggm
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
-            main_stream->paraller_for(
+            sycl_parallel_for(main_stream,
                     sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) {
                 gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1);
             });