diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h
index 5ce349a880edc..6b0bebf9c70c9 100644
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@@ -9,9 +9,6 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 
-#define GGML_SYCL_NAME "SYCL"
-#define GGML_SYCL_MAX_DEVICES 48
-
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/ggml/src/ggml-sycl/argmax.cpp b/ggml/src/ggml-sycl/argmax.cpp
new file mode 100644
index 0000000000000..b7d0471f8b06c
--- /dev/null
+++ b/ggml/src/ggml-sycl/argmax.cpp
@@ -0,0 +1,75 @@
+#include "argmax.hpp"
+
+static void argmax_f32_i32_sycl(const float * x, int * dst, const int ncols, const int nrows, queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    const size_t shared_mem = 256 * sizeof(float);
+
+    stream->submit([&](sycl::handler & cgh) {
+        sycl::local_accessor<float, 1> shared_data(sycl::range<1>(shared_mem / sizeof(float)), cgh);
+        sycl::local_accessor<int, 1>   shared_indices(sycl::range<1>(shared_mem / sizeof(float)), cgh);
+
+        cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+            const int tid = item_ct1.get_local_id(2);
+            const int row = item_ct1.get_global_id(1);
+
+            float max_val = -INFINITY;
+            int   max_idx = -1;
+
+            for (int col = tid; col < ncols; col += 256) {
+                float val = x[row * ncols + col];
+                if (val > max_val) {
+                    max_val = val;
+                    max_idx = col;
+                }
+            }
+
+            shared_data[tid]    = max_val;
+            shared_indices[tid] = max_idx;
+            item_ct1.barrier(sycl::access::fence_space::local_space);
+
+            for (int stride = 256 / 2; stride > 0; stride >>= 1) {
+                if (tid < stride) {
+                    float val1 = shared_data[tid];
+                    float val2 = shared_data[tid + stride];
+                    if (val2 > val1) {
+                        shared_data[tid]    = val2;
+                        shared_indices[tid] = shared_indices[tid + stride];
+                    }
+                }
+                item_ct1.barrier(sycl::access::fence_space::local_space);
+            }
+
+            if (tid == 0) {
+                dst[row] = shared_indices[0];
+            }
+        });
+    });
+}
+
+static void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+
+    const int64_t ncols = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
+
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    int32_t *     dst_dd  = static_cast<int32_t *>(dst->data);
+    argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_argmax(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
diff --git a/ggml/src/ggml-sycl/argmax.hpp b/ggml/src/ggml-sycl/argmax.hpp
new file mode 100644
index 0000000000000..431a7d6e71b0d
--- /dev/null
+++ b/ggml/src/ggml-sycl/argmax.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_ARGMAX_HPP
+#define GGML_SYCL_ARGMAX_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ARGMAX_HPP
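The argmax kernel above is a classic two-phase reduction: each work-item first scans columns tid, tid+256, tid+512, ... keeping its running maximum in local memory, then the work-group folds those 256 candidates with a tree reduction. For reference, this host-side sketch reproduces the same logic so it can be checked without a SYCL device (a minimal standalone program, assuming a 256-wide work-group as in the patch; `argmax_row_ref` is a made-up name, not part of the change):

#include <cmath>
#include <cstdio>
#include <vector>

static int argmax_row_ref(const float * x, int ncols) {
    const int block = 256;                      // SYCL_ARGMAX_BLOCK_SIZE in the patch
    std::vector<float> val(block, -INFINITY);   // plays the role of shared_data
    std::vector<int>   idx(block, -1);          // plays the role of shared_indices

    // phase 1: each "work-item" tid scans columns tid, tid+256, tid+512, ...
    for (int tid = 0; tid < block; ++tid) {
        for (int col = tid; col < ncols; col += block) {
            if (x[col] > val[tid]) { val[tid] = x[col]; idx[tid] = col; }
        }
    }
    // phase 2: tree reduction, halving the active range each step
    for (int stride = block / 2; stride > 0; stride >>= 1) {
        for (int tid = 0; tid < stride; ++tid) {
            if (val[tid + stride] > val[tid]) {
                val[tid] = val[tid + stride];
                idx[tid] = idx[tid + stride];
            }
        }
    }
    return idx[0];
}

int main() {
    std::vector<float> row = { 0.1f, 3.0f, -1.0f, 2.5f };
    std::printf("argmax = %d\n", argmax_row_ref(row.data(), (int) row.size()));  // prints 1
    return 0;
}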
diff --git a/ggml/src/ggml-sycl/argsort.cpp b/ggml/src/ggml-sycl/argsort.cpp
new file mode 100644
index 0000000000000..282ee6cc6273b
--- /dev/null
+++ b/ggml/src/ggml-sycl/argsort.cpp
@@ -0,0 +1,130 @@
+#include "argsort.hpp"
+
+static int next_power_of_2(int x) {
+    int n = 1;
+    while (n < x) {
+        n *= 2;
+    }
+    return n;
+}
+
+template <typename T> static inline void ggml_sycl_swap(T & a, T & b) {
+    T tmp = a;
+    a     = b;
+    b     = tmp;
+}
+
+template <ggml_sort_order order>
+__dpct_inline__ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad,
+                                              const sycl::nd_item<3> & item_ct1, uint8_t * dpct_local) {
+    // bitonic sort
+    int col = item_ct1.get_local_id(2);
+    int row = item_ct1.get_group(1);
+
+    if (col >= ncols_pad) {
+        return;
+    }
+
+    const float * x_row   = x + row * ncols;
+    auto          dst_row = (int *) dpct_local;
+
+    // initialize indices
+    dst_row[col] = col;
+
+    item_ct1.barrier(sycl::access::fence_space::local_space);
+
+    for (int k = 2; k <= ncols_pad; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (dst_row[col] >= ncols ||
+                        (dst_row[ixj] < ncols &&
+                         (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                                                         x_row[dst_row[col]] < x_row[dst_row[ixj]]))) {
+                        ggml_sycl_swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (dst_row[ixj] >= ncols ||
+                        (dst_row[col] < ncols &&
+                         (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                                                         x_row[dst_row[col]] > x_row[dst_row[ixj]]))) {
+                        ggml_sycl_swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            /*
+            DPCT1118:1: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            item_ct1.barrier(sycl::access::fence_space::local_space);
+        }
+    }
+
+    // copy the result to dst without the padding
+    if (col < ncols) {
+        dst[row * ncols + col] = dst_row[col];
+    }
+}
+
+static void argsort_f32_i32_sycl(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order,
+                                 queue_ptr stream) {
+    // bitonic sort requires ncols to be power of 2
+    const int ncols_pad = next_power_of_2(ncols);
+
+    const sycl::range<3> block_dims(1, 1, ncols_pad);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    const size_t         shared_mem = ncols_pad * sizeof(int);
+
+    if (order == GGML_SORT_ORDER_ASC) {
+        stream->submit([&](sycl::handler & cgh) {
+            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(sycl::range<1>(shared_mem), cgh);
+
+            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                k_argsort_f32_i32<GGML_SORT_ORDER_ASC>(
+                    x, dst, ncols, ncols_pad, item_ct1,
+                    dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>().get());
+            });
+        });
+    } else if (order == GGML_SORT_ORDER_DESC) {
+        stream->submit([&](sycl::handler & cgh) {
+            sycl::local_accessor<uint8_t, 1> dpct_local_acc_ct1(sycl::range<1>(shared_mem), cgh);
+
+            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) {
+                k_argsort_f32_i32<GGML_SORT_ORDER_DESC>(
+                    x, dst, ncols, ncols_pad, item_ct1,
+                    dpct_local_acc_ct1.get_multi_ptr<sycl::access::decorated::no>().get());
+            });
+        });
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+
+    const int64_t ncols = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    int32_t *     dst_dd  = static_cast<int32_t *>(dst->data);
+
+    argsort_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, order, main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_argsort(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
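The kernel above runs one bitonic sorting network per row: indices are sorted in local memory, the row is padded to a power of two, and padded (out-of-range) indices always lose comparisons so they sink to the end. A hedged host-side reference of the same network, useful for checking the comparator logic (standalone sketch; `argsort_ref_asc` is a hypothetical helper, not part of the patch):

#include <cstdio>
#include <utility>
#include <vector>

static std::vector<int> argsort_ref_asc(const std::vector<float> & x) {
    int ncols = (int) x.size();
    int pad = 1;
    while (pad < ncols) pad *= 2;               // next_power_of_2

    std::vector<int> idx(pad);
    for (int i = 0; i < pad; ++i) idx[i] = i;   // indices >= ncols are padding sentinels

    auto greater = [&](int a, int b) {          // "a should come after b" (ascending)
        if (a >= ncols) return true;            // sentinel sorts to the end
        if (b >= ncols) return false;
        return x[a] > x[b];
    };
    for (int k = 2; k <= pad; k *= 2) {
        for (int j = k / 2; j > 0; j /= 2) {
            for (int col = 0; col < pad; ++col) {
                int ixj = col ^ j;
                if (ixj > col) {
                    bool asc = (col & k) == 0;  // direction of this bitonic subsequence
                    if (asc ? greater(idx[col], idx[ixj]) : greater(idx[ixj], idx[col])) {
                        std::swap(idx[col], idx[ixj]);
                    }
                }
            }
        }
    }
    idx.resize(ncols);                          // drop the padding, as the kernel does
    return idx;
}

int main() {
    std::vector<float> x = { 2.0f, -1.0f, 5.0f };
    for (int i : argsort_ref_asc(x)) std::printf("%d ", i);  // prints: 1 0 2
    std::printf("\n");
    return 0;
}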
diff --git a/ggml/src/ggml-sycl/argsort.hpp b/ggml/src/ggml-sycl/argsort.hpp
new file mode 100644
index 0000000000000..e79d20e8a7592
--- /dev/null
+++ b/ggml/src/ggml-sycl/argsort.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_ARGSORT_HPP
+#define GGML_SYCL_ARGSORT_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_ARGSORT_HPP
diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp
index b1df4e5db1753..965bde36ebf8f 100644
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -29,6 +29,16 @@
 #include "wkv6.hpp"
 #include "outprod.hpp"
 #include "element_wise.hpp"
+#include "binbcast.hpp"
+#include "argmax.hpp"
+#include "argsort.hpp"
+#include "cpy.hpp"
+#include "getrows.hpp"
+#include "diagmask.hpp"
+#include "scale.hpp"
+#include "clamp.hpp"
+#include "pool2d.hpp"
+#include "sum.hpp"
 #include "gla.hpp"
 
 #endif // GGML_SYCL_BACKEND_HPP
diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp
new file mode 100644
index 0000000000000..8fc6e1b56b22f
--- /dev/null
+++ b/ggml/src/ggml-sycl/binbcast.cpp
@@ -0,0 +1,339 @@
+#include "binbcast.hpp"
+
+static __dpct_inline__ float op_repeat(const float a, const float b) {
+    return b;
+    GGML_UNUSED(a);
+}
+
+static __dpct_inline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __dpct_inline__ float op_sub(const float a, const float b) {
+    return a - b;
+}
+
+static __dpct_inline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __dpct_inline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, int ne0, int ne1, int ne2, int ne3,
+                        int ne10, int ne11, int ne12, int ne13,
+                        /*int s0, */ int s1, int s2, int s3,
+                        /*int s10,*/ int s11, int s12, int s13, const sycl::nd_item<3> & item_ct1) {
+    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+    const int i1  = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1));
+    const int i2  = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + item_ct1.get_local_id(0)) / ne3;
+    const int i3  = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + item_ct1.get_local_id(0)) % ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3 * s3 + i2 * s2 + i1 * s1;
+    const size_t i_src1 = i13 * s13 + i12 * s12 + i11 * s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t *        dst_row  = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0; i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        const int i10 = i0 % ne10;
+        dst_row[i0]   = (dst_t) bin_op(src0 ? (float) src0_row[i0] : 0.0f, (float) src1_row[i10]);
+    }
+}
+
+template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, int ne0, int ne1, int ne2,
+                                int ne3, int ne10, int ne11, int ne12, int ne13,
+                                /*int s0, */ int s1, int s2, int s3,
+                                /*int s10,*/ int s11, int s12, int s13, const sycl::nd_item<3> & item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+
+    const int i3 = i / (ne2 * ne1 * ne0);
+    const int i2 = (i / (ne1 * ne0)) % ne2;
+    const int i1 = (i / ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3 * s3 + i2 * s2 + i1 * s1;
+    const size_t i_src1 = i13 * s13 + i12 * s12 + i11 * s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t *        dst_row  = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0]   = (dst_t) bin_op(src0 ? (float) src0_row[i0] : 0.0f, (float) src1_row[i10]);
+}
+
+template <float (*bin_op)(const float, const float)> struct bin_bcast_sycl {
+    template <typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst,
+                    const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, queue_ptr stream) {
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10 / ne0;
+        int nr1 = ne11 / ne1;
+        int nr2 = ne12 / ne2;
+        int nr3 = ne13 / ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = { ne0, ne1, ne2, ne3 };
+        int64_t cne1[] = { ne10, ne11, ne12, ne13 };
+        size_t  cnb0[] = { nb0, nb1, nb2, nb3 };
+        size_t  cnb1[] = { nb10, nb11, nb12, nb13 };
+        auto    collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1]  = cne[2];
+            cne[2]  = cne[3];
+            cne[3]  = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0 / 2LL, 1LL);
+
+            sycl::range<3> block_dims(1, 1, 1);
+            block_dims[2] = std::min<unsigned int>(hne0, block_size);
+            block_dims[1] = std::min<unsigned int>(ne1, block_size / (unsigned int) block_dims[2]);
+            block_dims[0] = std::min(std::min<unsigned int>(ne2 * ne3, block_size / (unsigned int) block_dims[2] /
+                                                                           (unsigned int) block_dims[1]),
+                                     64U);
+
+            sycl::range<3> block_nums((ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
+                                      (ne1 + block_dims[1] - 1) / block_dims[1],
+                                      (hne0 + block_dims[2] - 1) / block_dims[2]);
+
+            if (block_nums[0] > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
+                {
+                    dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+                    stream->parallel_for(
+                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size),
+                                          sycl::range<3>(1, 1, block_size)),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_bin_bcast_unravel<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11,
+                                                        ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1);
+                        });
+                }
+            } else {
+                /*
+                DPCT1049:16: The work-group size passed to the SYCL kernel may
+                exceed the limit. To get the device limit, query
+                info::device::max_work_group_size. Adjust the work-group size if
+                needed.
+                */
+                dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+                stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                                     [=](sycl::nd_item<3> item_ct1) {
+                                         k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11,
+                                                             ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1);
+                                     });
+            }
+        }
+    }
+};
+
+template <class op>
+inline void ggml_sycl_op_bin_bcast(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+                                   const void * src0_dd, const void * src1_dd, void * dst_dd,
+                                   const queue_ptr & main_stream) {
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const float *) src0_dd, (const float *) src1_dd, (float *) dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(src0, src1, dst, (const sycl::half *) src0_dd, (const float *) src1_dd, (sycl::half *) dst_dd,
+             main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const sycl::half *) src0_dd, (const float *) src1_dd, (float *) dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+        op()(src0, src1, dst, (const int32_t *) src0_dd, (const int32_t *) src1_dd, (int32_t *) dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
+        op()(src0, src1, dst, (const int16_t *) src0_dd, (const int16_t *) src1_dd, (int16_t *) dst_dd, main_stream);
+    } else {
+        GGML_LOG_ERROR("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type),
+                       ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    const void * src0_dd = static_cast<const void *>(dst->src[0]->data);
+    const void * src1_dd = static_cast<const void *>(dst->src[1]->data);
+    void *       dst_dd  = static_cast<void *>(dst->data);
+    const dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd,
+                                                   main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    const void * src0_dd = static_cast<const void *>(dst->src[0]->data);
+    const void * src1_dd = static_cast<const void *>(dst->src[1]->data);
+    void *       dst_dd  = static_cast<void *>(dst->data);
+    const dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd,
+                                                   main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    const void * src0_dd = static_cast<const void *>(dst->src[0]->data);
+    const void * src1_dd = static_cast<const void *>(dst->src[1]->data);
+    void *       dst_dd  = static_cast<void *>(dst->data);
+    const dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd,
+                                                   main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    const void * src0_dd = static_cast<const void *>(dst->src[0]->data);
+    const void * src1_dd = static_cast<const void *>(dst->src[1]->data);
+    void *       dst_dd  = static_cast<void *>(dst->data);
+    const dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(dst->src[0], dst->src[1], dst, src0_dd, src1_dd, dst_dd,
+                                                   main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    const void * src0_d = static_cast<const void *>(dst->src[0]->data);
+    void *       dst_d  = static_cast<void *>(dst->data);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(dst, dst->src[0], dst, nullptr, src0_d, dst_d, main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_add(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_sub(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_mul(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_div(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_repeat(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
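Before launching, bin_bcast_sycl::operator() collapses leading dimensions that are not broadcast (repeat factor 1) so the kernel sees the smallest possible 4D problem. A standalone host-side sketch of that collapse step, not the patch's code verbatim (byte strides, cnb*, collapse alongside in the real code):

#include <cstdint>
#include <cstdio>

int main() {
    // dst/src0 shape {4, 3, 2, 1} and src1 shape {4, 3, 1, 1}:
    // src1 is broadcast along dimension 2.
    int64_t cne0[4] = { 4, 3, 2, 1 };  // dst extents
    int64_t cne1[4] = { 4, 3, 1, 1 };  // src1 extents
    int     nr[4]   = { 1, 1, 0, 1 };  // nr[i] = cne1[i] / cne0[i], as in the patch; != 1 marks broadcast

    auto collapse = [](int64_t cne[]) {
        cne[0] *= cne[1];
        cne[1]  = cne[2];
        cne[2]  = cne[3];
        cne[3]  = 1;
    };

    for (int i = 0; i < 4; i++) {
        if (nr[i] != 1) {
            break;                     // stop at the first broadcast dimension
        }
        if (i > 0) {
            collapse(cne0);
            collapse(cne1);
        }
    }
    // dims 0 and 1 were contiguous and un-broadcast, so they merged into one:
    std::printf("dst:  {%lld, %lld, %lld, %lld}\n", (long long) cne0[0], (long long) cne0[1],
                (long long) cne0[2], (long long) cne0[3]);  // {12, 2, 1, 1}
    std::printf("src1: {%lld, %lld, %lld, %lld}\n", (long long) cne1[0], (long long) cne1[1],
                (long long) cne1[2], (long long) cne1[3]);  // {12, 1, 1, 1}
    return 0;
}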
diff --git a/ggml/src/ggml-sycl/binbcast.hpp b/ggml/src/ggml-sycl/binbcast.hpp
new file mode 100644
index 0000000000000..db8c8f55340a9
--- /dev/null
+++ b/ggml/src/ggml-sycl/binbcast.hpp
@@ -0,0 +1,16 @@
+#ifndef GGML_SYCL_BINBCAST_HPP
+#define GGML_SYCL_BINBCAST_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_BINBCAST_HPP
diff --git a/ggml/src/ggml-sycl/clamp.cpp b/ggml/src/ggml-sycl/clamp.cpp
new file mode 100644
index 0000000000000..35eb8deca480d
--- /dev/null
+++ b/ggml/src/ggml-sycl/clamp.cpp
@@ -0,0 +1,52 @@
+#include "clamp.hpp"
+
+static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
+                      const sycl::nd_item<3> & item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+static void clamp_f32_sycl(const float * x, float * dst, const float min, const float max, const int k,
+                           queue_ptr stream) {
+    const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) { clamp_f32(x, dst, min, max, k, item_ct1); });
+}
+
+inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+    const dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
+
+    clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), main_stream);
+    /*
+    DPCT1010:88: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    SYCL_CHECK(0);
+    */
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_clamp(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
diff --git a/ggml/src/ggml-sycl/clamp.hpp b/ggml/src/ggml-sycl/clamp.hpp
new file mode 100644
index 0000000000000..fdfbff55b553c
--- /dev/null
+++ b/ggml/src/ggml-sycl/clamp.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_CLAMP_HPP
+#define GGML_SYCL_CLAMP_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_CLAMP_HPP
diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp
index 022e7b7637bd3..e425f5ec6e618 100644
--- a/ggml/src/ggml-sycl/common.cpp
+++ b/ggml/src/ggml-sycl/common.cpp
@@ -12,9 +12,6 @@
 
 #include "common.hpp"
 
-#include "ggml-backend-impl.h"
-#include "ggml-impl.h"
-
 int get_current_device_id() {
   return dpct::dev_mgr::instance().current_device_id();
 }
@@ -55,6 +52,16 @@ bool gpu_has_xmx(sycl::device &dev) {
   return dev.has(sycl::aspect::ext_intel_matrix);
 }
 
+const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return GGML_SYCL_NAME "_Split";
+
+    GGML_UNUSED(buft);
+}
+
+bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
+    return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
+}
+
 int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) {
   const int64_t max_range = std::numeric_limits<int>::max();
   int64_t sycl_down_blk_size = block_size;
@@ -65,37 +72,3 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block
   }
   return sycl_down_blk_size;
 }
-
-void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                          const ggml_tensor *src1, ggml_tensor *dst,
-                          const ggml_sycl_op_flatten_t op) try {
-
-    const bool use_src1 = src1 != nullptr;
-    if(use_src1)
-        GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
-    GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0);
-
-    // dd = data device
-    float * src0_ddf = (float *) src0->data;
-    float * src1_ddf = use_src1 ? (float *) src1->data : nullptr;
-    float * dst_ddf  = (float *) dst->data;
-
-    ggml_sycl_pool_alloc<float> src0_f(ctx.pool());
-    ggml_sycl_pool_alloc<float> src1_f(ctx.pool());
-    ggml_sycl_pool_alloc<float> dst_f(ctx.pool());
-
-    ggml_sycl_set_device(ctx.device);
-    queue_ptr main_stream = ctx.stream();
-    // GGML_SYCL_DEBUG("ctx.device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
-    //  ctx.device, main_stream, src0_on_device, src1_on_device, dst_on_device);
-
-    // do the computation
-    op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    // print_ggml_tensor("tensor", dst);
-}
-catch (sycl::exception const &exc) {
-
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
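Note the change of detection strategy here: ggml_sycl_op_flatten used to strcmp the buffer type's name against "SYCL_Split", while the new ggml_backend_buffer_is_sycl_split compares the get_name slot of the vtable against the known implementation, a single pointer comparison. A minimal sketch of this pattern with simplified stand-in types (not ggml's real structs):

#include <cstdio>

struct buft_iface { const char * (*get_name)(void *); };
struct buffer_type { buft_iface iface; };
struct buffer { buffer_type * buft; };

static const char * split_get_name(void *) { return "SYCL_Split"; }
static const char * plain_get_name(void *) { return "SYCL"; }

static bool is_split(const buffer & b) {
    // pointer identity: true only for buffers created by the split buffer type,
    // immune to another backend choosing the same display name
    return b.buft->iface.get_name == split_get_name;
}

int main() {
    buffer_type split_t { { split_get_name } };
    buffer_type plain_t { { plain_get_name } };
    buffer b0 { &split_t }, b1 { &plain_t };
    std::printf("%d %d\n", is_split(b0), is_split(b1));  // prints: 1 0
    return 0;
}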
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index abad847ca8199..faf25dd9c198d 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -17,7 +17,6 @@
 #include
 
 #include "dpct/helper.hpp"
-#include "ggml-sycl.h"
 #include "presets.hpp"
 #if GGML_SYCL_DNNL
 #include "dnnl.hpp"
@@ -31,11 +30,17 @@
 #pragma clang diagnostic ignored "-Wnested-anon-types"
 #include "ggml-common.h"
 #pragma clang diagnostic pop
+#include
+#include
+
+#include "ggml-backend-impl.h"
+#include "ggml-impl.h"
+#include "ggml.h"
 
 void* ggml_sycl_host_malloc(size_t size);
 void ggml_sycl_host_free(void* ptr);
 
-static int g_ggml_sycl_debug = 0;
+extern int g_ggml_sycl_debug;
 #define GGML_SYCL_DEBUG(...)        \
   do {                              \
     if (g_ggml_sycl_debug)          \
@@ -85,6 +90,9 @@
 #define GGML_SYCL_MMV_Y 1
 #endif
 
+#define GGML_SYCL_NAME "SYCL"
+#define GGML_SYCL_MAX_DEVICES 48
+
 typedef sycl::queue *queue_ptr;
 
 enum ggml_sycl_backend_gpu_mode {
@@ -427,258 +435,21 @@ typedef void (*ggml_sycl_op_flatten_t)(ggml_backend_sycl_context & ctx, const gg
                                        const float *src1_dd, float *dst_dd,
                                        const queue_ptr &main_stream);
 
-template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-                        int ne0, int ne1, int ne2, int ne3,
-                        int ne10, int ne11, int ne12, int ne13,
-                        /*int s0, */ int s1, int s2, int s3,
-                        /*int s10,*/ int s11, int s12, int s13,
-                        const sycl::nd_item<3> &item_ct1) {
-    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1));
-    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) /
-                   ne3;
-    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) %
-                   ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0;
-         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-                                int ne0, int ne1, int ne2, int ne3,
-                                int ne10, int ne11, int ne12, int ne13,
-                                /*int s0, */ int s1, int s2, int s3,
-                                /*int s10,*/ int s11, int s12, int s13,
-                                const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-
-template <float (*bin_op)(const float, const float)>
-struct bin_bcast_sycl {
-    template <typename src0_t, typename src1_t, typename dst_t>
-    void operator()(ggml_backend_sycl_context & ctx,
-                    const struct ggml_tensor *src0,
-                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
-                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
-                    queue_ptr stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne0[] = {ne0, ne1, ne2, ne3};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb0[] = {nb0, nb1, nb2, nb3};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        for (int i = 0; i < 4; i++) {
-            if (nr[i] != 1) {
-                break;
-            }
-            if (i > 0) {
-                collapse_nb(cnb0, cne0);
-                collapse_nb(cnb1, cne1);
-                collapse(cne0);
-                collapse(cne1);
-            }
-        }
-        {
-            int64_t ne0 = cne0[0];
-            int64_t ne1 = cne0[1];
-            int64_t ne2 = cne0[2];
-            int64_t ne3 = cne0[3];
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb0[0];
-            size_t nb1 = cnb0[1];
-            size_t nb2 = cnb0[2];
-            size_t nb3 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            sycl::range<3> block_dims(1, 1, 1);
-            block_dims[2] = std::min<unsigned int>(hne0, block_size);
-            block_dims[1] = std::min<unsigned int>(
-                ne1, block_size / (unsigned int)block_dims[2]);
-            block_dims[0] = std::min(
-                std::min<unsigned int>(
-                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
-                                   (unsigned int)block_dims[1]),
-                64U);
-
-            sycl::range<3> block_nums(
-                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
-                (ne1 + block_dims[1] - 1) / block_dims[1],
-                (hne0 + block_dims[2] - 1) / block_dims[2]);
-
-            if (block_nums[0] > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                {
-                    dpct::has_capability_or_fail(stream->get_device(),
-                                                 {sycl::aspect::fp16});
-
-                    stream->parallel_for(
-                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
-                                              sycl::range<3>(1, 1, block_size),
-                                          sycl::range<3>(1, 1, block_size)),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_bin_bcast_unravel<bin_op>(
-                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
-                                s13, item_ct1);
-                        });
-                }
-            } else {
-                /*
-                DPCT1049:16: The work-group size passed to the SYCL kernel may
-                exceed the limit. To get the device limit, query
-                info::device::max_work_group_size. Adjust the work-group size if
-                needed.
-                */
-                dpct::has_capability_or_fail(stream->get_device(),
-                                             {sycl::aspect::fp16});
-
-                stream->parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
-                                            ne2, ne3, ne10, ne11, ne12, ne13,
-                                            s1, s2, s3, s11, s12, s13,
-                                            item_ct1);
-                    });
-            }
-        }
-        GGML_UNUSED(ctx);
-    }
-};
+bool gpu_has_xmx(sycl::device &dev);
+const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft);
+bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);
 
-template <class op>
-inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd,
-                                   const queue_ptr &main_stream) {
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
-             (sycl::half *)dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(ctx, src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
-             main_stream);
-    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
-        op()(ctx, src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
-             main_stream);
-    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
-        op()(ctx, src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
-             main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-                ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
-    }
-}
+// Some backend specific macros
+#define GGML_SYCL_TENSOR_BINARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, dst->src[0], ne) \
+    GGML_TENSOR_LOCALS(size_t, nb0, dst->src[0], nb) GGML_TENSOR_LOCALS(int64_t, ne1, dst->src[1], ne) \
+    GGML_TENSOR_LOCALS(size_t, nb1, dst->src[1], nb) GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
 
-bool gpu_has_xmx(sycl::device &dev);
+#define GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
 
-void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                          const ggml_tensor *src1, ggml_tensor *dst,
-                          const ggml_sycl_op_flatten_t op);
 
 #endif // GGML_SYCL_COMMON_HPP
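The new GGML_SYCL_TENSOR_BINARY_OP_LOCALS macro lets kernels that only receive `dst` recover all extents and strides, which is what makes dropping ggml_sycl_op_flatten possible. Roughly, each GGML_TENSOR_LOCALS(type, prefix, pointer, array) invocation declares four locals prefix0..prefix3; a sketch with a trimmed stand-in macro (the real one lives in ggml's headers and also emits GGML_UNUSED markers):

#include <cstdint>
#include <cstdio>

struct tensor { int64_t ne[4]; };

#define TENSOR_LOCALS(type, prefix, pointer, array)   \
    const type prefix##0 = (pointer)->array[0];       \
    const type prefix##1 = (pointer)->array[1];       \
    const type prefix##2 = (pointer)->array[2];       \
    const type prefix##3 = (pointer)->array[3];

int main() {
    tensor src0 = { { 32, 4, 2, 1 } };
    tensor * s = &src0;
    TENSOR_LOCALS(int64_t, ne0, s, ne)  // declares ne00, ne01, ne02, ne03
    std::printf("%lld %lld %lld %lld\n", (long long) ne00, (long long) ne01,
                (long long) ne02, (long long) ne03);  // 32 4 2 1
    return 0;
}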
diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp
index d41cfd3a6ec88..c69e109d86e9a 100644
--- a/ggml/src/ggml-sycl/concat.cpp
+++ b/ggml/src/ggml-sycl/concat.cpp
@@ -158,34 +158,35 @@ static void concat_f32_sycl_non_cont(
         });
 }
 
-void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
-  const ggml_tensor *src0 = dst->src[0];
-  const ggml_tensor *src1 = dst->src[1];
-  queue_ptr stream = ctx.stream();
-
-  const int32_t dim = ((int32_t *)dst->op_params)[0];
-
-  if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-    const float *src0_d = (const float *)src0->data;
-    const float *src1_d = (const float *)src1->data;
-
-    float *dst_d = (float *)dst->data;
-
-    if (dim != 3) {
-      for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_sycl(
-            src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
-            dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
-            src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
-      }
-    } else {
-      const size_t size0 = ggml_nbytes(src0);
-      const size_t size1 = ggml_nbytes(src1);
-
-      SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
-      SYCL_CHECK(CHECK_TRY_ERROR(
-          stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
-    }
+static void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    const int32_t dim = ((int32_t *) dst->op_params)[0];
+
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+        const float * src0_d = (const float *) src0->data;
+        const float * src1_d = (const float *) src1->data;
+
+        float * dst_d = (float *) dst->data;
+
+        if (dim != 3) {
+            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+                concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
+                                dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
+                                dst->ne[1], dst->ne[2], dim, stream);
+            }
+        } else {
+            const size_t size0 = ggml_nbytes(src0);
+            const size_t size1 = ggml_nbytes(src1);
+
+            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
+            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
+        }
   } else
     concat_f32_sycl_non_cont(
         stream, (const char *)src0->data, (const char *)src1->data,
@@ -194,4 +195,13 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
         src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
         src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
         dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_concat(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
diff --git a/ggml/src/ggml-sycl/concat.hpp b/ggml/src/ggml-sycl/concat.hpp
index e5cb7314c9f33..8a1c5596e3654 100644
--- a/ggml/src/ggml-sycl/concat.hpp
+++ b/ggml/src/ggml-sycl/concat.hpp
@@ -15,6 +15,6 @@
 
 #include "common.hpp"
 
-void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
+void ggml_sycl_concat(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 #endif // GGML_SYCL_CONCAT_HPP
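The `nb[3] / 4` offsets in the concat path are worth spelling out: ggml strides (nb) are in bytes, while the kernel pointers are float*, so dividing by sizeof(float) converts a byte stride into an element stride. A small standalone check (toy shape, not from the patch):

#include <cstdint>
#include <cstdio>

int main() {
    // a contiguous f32 tensor of shape {ne0=8, ne1=4, ne2=2, ne3=3}
    int64_t ne[4] = { 8, 4, 2, 3 };
    size_t  nb[4];
    nb[0] = sizeof(float);              // 4 bytes between consecutive elements
    for (int i = 1; i < 4; ++i) {
        nb[i] = nb[i - 1] * ne[i - 1];  // contiguous layout
    }
    // float elements between consecutive i3 slices, as used by src0_d + i3 * (nb[3] / 4)
    std::printf("nb[3] = %zu bytes = %zu floats per i3 slice\n", nb[3], nb[3] / 4);
    // prints: nb[3] = 256 bytes = 64 floats per i3 slice (8 * 4 * 2 elements)
    return 0;
}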
diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp
index ddba601e10fcc..7d0bb730a19b4 100644
--- a/ggml/src/ggml-sycl/conv.cpp
+++ b/ggml/src/ggml-sycl/conv.cpp
@@ -71,7 +71,9 @@ static void conv_transpose_1d_f32_f32_sycl(
         });
 }
 
-void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+static void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
     const ggml_tensor *src0 = dst->src[0];
     const ggml_tensor *src1 = dst->src[1];
     const float * src0_d = (const float *)src0->data;
@@ -79,6 +81,7 @@ void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor
     float * dst_d = (float *)dst->data;
     dpct::queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -96,5 +99,13 @@ void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor
         src0->ne[0], src0->ne[1], src0->ne[2],
         src1->ne[0], dst->ne[0],
         src0_d, src1_d, dst_d, stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
 }
 
+void ggml_sycl_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_conv_transpose_1d(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp
index f9e60dc758029..d5d69b3f86cbd 100644
--- a/ggml/src/ggml-sycl/conv.hpp
+++ b/ggml/src/ggml-sycl/conv.hpp
@@ -15,6 +15,6 @@
 
 #include "common.hpp"
 
-void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst);
+void ggml_sycl_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 #endif // GGML_SYCL_CONV_HPP
diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp
new file mode 100644
index 0000000000000..1559db5dc1372
--- /dev/null
+++ b/ggml/src/ggml-sycl/cpy.cpp
@@ -0,0 +1,401 @@
+#include "cpy.hpp"
+
+#include <float.h>
+
+static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    float *       dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    sycl::half *  dsti = (sycl::half *) cdsti;
+
+    *dsti = sycl::vec<float, 1>(*xi).convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+}
+
+static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const sycl::half * xi   = (const sycl::half *) cxi;
+    sycl::half *       dsti = (sycl::half *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_f16_f32(const char * cxi, char * cdsti) {
+    const sycl::half * xi   = (const sycl::half *) cxi;
+    float *            dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
+    const int16_t * xi   = (const int16_t *) cxi;
+    int16_t *       dsti = (int16_t *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    const int32_t * xi   = (const int32_t *) cxi;
+    int32_t *       dsti = (int32_t *) cdsti;
+
+    *dsti = *xi;
+}
+
+template <cpy_kernel_t cpy_1>
+static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
+                        const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
+                        const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
+                        const sycl::nd_item<3> & item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int i03       = i / (ne00 * ne01 * ne02);
+    const int i02       = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+    const int i01       = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
+    const int i00       = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
+    const int x_offset  = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+    const int i13        = i / (ne10 * ne11 * ne12);
+    const int i12        = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
+    const int i11        = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
+    const int i10        = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
+    const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q8_0 *  dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f;  // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax          = sycl::fmax(amax, sycl::fabs((float) v));
+    }
+
+    const float d  = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f / d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j] * id;
+
+        dsti->qs[j] = sycl::round((float) x0);
+    }
+}
+
+static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q4_0 *  dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < sycl::fabs((float) v)) {
+            amax = sycl::fabs((float) v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f / d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0 / 2; ++j) {
+        const float x0 = xi[0 + j] * id;
+        const float x1 = xi[QK4_0 / 2 + j] * id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f));
+
+        dsti->qs[j] = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi   = (const float *) cxi;
+    block_q4_1 *  dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) {
+            vmin = v;
+        }
+        if (v > vmax) {
+            vmax = v;
+        }
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f / d : 0.0f;
+
+    dsti->dm.x() = d;
+    dsti->dm.y() = vmin;
+
+    for (int j = 0; j < QK4_1 / 2; ++j) {
+        const float x0 = (xi[0 + j] - vmin) * id;
+        const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f));
+
+        dsti->qs[j] = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
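The q4_0 block above stores one scale d per 32 values and packs two 4-bit codes per byte; dequantization is x ≈ (q - 8) * d, and d = vmax / -8 is chosen so the largest-magnitude value maps exactly to quant level 0. A host-side roundtrip sketch of the same scheme (standalone; assumes QK4_0 == 32, uses plain std:: calls instead of the sycl/dpct ones):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const int QK = 32;
    float x[QK];
    for (int j = 0; j < QK; ++j) x[j] = std::sin(0.3f * j);  // sample data

    // find the value with the largest magnitude, keeping its sign
    float amax = 0.0f, vmax = 0.0f;
    for (int j = 0; j < QK; ++j) {
        if (std::fabs(x[j]) > amax) { amax = std::fabs(x[j]); vmax = x[j]; }
    }
    const float d  = vmax / -8;
    const float id = d ? 1.0f / d : 0.0f;

    uint8_t qs[QK / 2];
    for (int j = 0; j < QK / 2; ++j) {
        // two 4-bit codes per byte: low nibble = x[j], high nibble = x[j + 16]
        uint8_t q0 = (uint8_t) std::min(15, (int) (x[j] * id + 8.5f));
        uint8_t q1 = (uint8_t) std::min(15, (int) (x[QK / 2 + j] * id + 8.5f));
        qs[j] = q0 | (q1 << 4);
    }

    // dequantize and report the worst absolute error
    float max_err = 0.0f;
    for (int j = 0; j < QK / 2; ++j) {
        float y0 = ((qs[j] & 0x0F) - 8) * d;
        float y1 = ((qs[j] >> 4)   - 8) * d;
        max_err = std::fmax(max_err, std::fabs(y0 - x[j]));
        max_err = std::fmax(max_err, std::fabs(y1 - x[QK / 2 + j]));
    }
    std::printf("d = %f, max abs error = %f\n", d, max_err);  // error is at most about |d|
    return 0;
}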
+
+template <cpy_kernel_t cpy_blck, int qk>
+static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
+                      const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
+                      const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
+                      const sycl::nd_item<3> & item_ct1) {
+    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03      = i / (ne00 * ne01 * ne02);
+    const int i02      = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+    const int i01      = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
+    const int i00      = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
+    const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+    const int i13        = i / (ne10 * ne11 * ne12);
+    const int i12        = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
+    const int i11        = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
+    const int i10        = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
+    const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
+static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                  const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                  const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f16_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                           nb10, nb11, nb12, nb13, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                  const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                  const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                           nb10, nb11, nb12, nb13, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                  const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                  const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                           nb10, nb11, nb12, nb13, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                                                 item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                                                 item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                   const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                   const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                   const int nb12, const int nb13, queue_ptr stream) {
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02,
+                                                                 nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13,
+                                                                 item_ct1);
+                         });
+}
+
+static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                  const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                  const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                           nb10, nb11, nb12, nb13, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                  const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                  const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                           nb10, nb11, nb12, nb13, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                                  const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                                  const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                                  const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
+                                           nb10, nb11, nb12, nb13, item_ct1);
+            });
+    }
+}
+
+void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer));
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    GGML_SYCL_TENSOR_BINARY_OP_CP_LOCALS
+
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    queue_ptr main_stream = ctx.stream();
+
+    char * src0_ddc = (char *) src0->data;
+    char * src1_ddc = (char *) src1->data;
+    GGML_SYCL_DEBUG("%s: type combination supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
+                    ggml_type_name(src1->type));
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                              nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f32_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                              nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                               nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f16_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
+                              nb11, nb12, nb13, main_stream);
nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f16_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { + ggml_cpy_i16_i16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + ggml_cpy_i32_i32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else { + GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), + ggml_type_name(src1->type)); + GGML_ABORT("fatal error"); + } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + // TODO: why do we pass dst as src1 here? + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_cpy(ctx, dst->src[0], dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp new file mode 100644 index 0000000000000..0a0f561d2309a --- /dev/null +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -0,0 +1,11 @@ +#ifndef GGML_SYCL_CPY_HPP +#define GGML_SYCL_CPY_HPP + +#include "common.hpp" + +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); + +void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); +void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_CPY_HPP diff --git a/ggml/src/ggml-sycl/diagmask.cpp b/ggml/src/ggml-sycl/diagmask.cpp new file mode 100644 index 0000000000000..1ec00194dda07 --- /dev/null +++ b/ggml/src/ggml-sycl/diagmask.cpp @@ -0,0 +1,54 @@ +#include "diagmask.hpp" +#include <float.h> + +static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, + const int n_past, const sycl::nd_item<3> & item_ct1) { + const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1); + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + + if (col >= ncols) { + return; + } + + const int i = row * ncols + col; + //dst[i] = col > (n_past + row % rows_per_channel) ?
-INFINITY : x[i]; + //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; +} + +static void diag_mask_inf_f32_sycl(const float * x, float * dst, const int ncols_x, const int nrows_x, + const int rows_per_channel, const int n_past, queue_ptr stream) { + const sycl::range<3> block_dims(1, SYCL_DIAG_MASK_INF_BLOCK_SIZE, 1); + const int block_num_x = (ncols_x + SYCL_DIAG_MASK_INF_BLOCK_SIZE - 1) / SYCL_DIAG_MASK_INF_BLOCK_SIZE; + const sycl::range<3> block_nums(1, block_num_x, nrows_x); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + diag_mask_inf_f32(x, dst, ncols_x, rows_per_channel, n_past, item_ct1); + }); +} + +inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t ne01 = dst->src[0]->ne[1]; + const int nrows0 = ggml_nrows(dst->src[0]); + + const int n_past = ((int32_t *) dst->op_params)[0]; + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + + diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_diag_mask_inf(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/diagmask.hpp b/ggml/src/ggml-sycl/diagmask.hpp new file mode 100644 index 0000000000000..8a42a4e0899ba --- /dev/null +++ b/ggml/src/ggml-sycl/diagmask.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_DIAG_MASK +#define GGML_SYCL_DIAG_MASK + +#include "common.hpp" + +void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_DIAG_MASK diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 0d097357ce79b..224854307337e 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -973,6 +973,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec( } #else const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion + GGML_UNUSED(ctx); #endif // GGML_SYCL_F16 switch (src0->type) { diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 4bcd74376eaac..f1d9948447ba6 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -1,4 +1,3 @@ -#include "common.hpp" #include "element_wise.hpp" void acc_f32(const float * x, const float * y, float * dst, const int ne, @@ -509,522 +508,466 @@ void pad_f32_sycl(const float *x, float *dst, const int ne00, }); } -inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type ==
GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); +inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); +inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + exp_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - log_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + log_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + sigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { +inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + sqrt_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + sin_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + cos_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor *dst) try { - step_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + step_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + neg_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); float negative_slope;
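// op_params is the tensor's small inline argument block; for LEAKY_RELU its
// first four bytes hold the slope as an f32, which is recovered with the
// memcpy below rather than a pointer cast, keeping the read well-defined
// regardless of alignment and strict-aliasing concerns.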
memcpy(&negative_slope, dst->op_params, sizeof(float)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); - leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), negative_slope, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); +inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); + sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(dst->src[0]), main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + + const float sf0 = (float)dst->ne[0]/dst->src[0]->ne[0]; + const float sf1 = (float)dst->ne[1]/dst->src[0]->ne[1]; + const float sf2 = (float)dst->ne[2]/dst->src[0]->ne[2]; + const float sf3 = (float)dst->ne[3]/dst->src[0]->ne[3]; - const float sf0 = (float)dst->ne[0]/src0->ne[0]; - const float sf1 = (float)dst->ne[1]/src0->ne[1]; - const float sf2 = (float)dst->ne[2]/src0->ne[2]; - const float sf3 = (float)dst->ne[3]/src0->ne[3]; + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - upscale_f32_sycl(src0_dd, dst_dd, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], + upscale_f32_sycl(src0_dd, dst_dd, dst->src[0]->nb[0], dst->src[0]->nb[1], dst->src[0]->nb[2], dst->src[0]->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline
void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + GGML_ASSERT(dst->src[0]->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + float * dst_dd = static_cast<float *>(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); pad_f32_sycl(src0_dd, dst_dd, - src0->ne[0], src0->ne[1], src0->ne[2], + dst->src[0]->ne[0], dst->src[0]->ne[1], dst->src[0]->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, + ggml_tensor *dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + + const dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast<const float *>(dst->src[0]->data); + const float * src1_dd = static_cast<const float *>(dst->src[1]->data); + float * dst_dd = static_cast<float *>(dst->data); int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused int offset = dst->op_params[3] / 4; // offset in bytes - acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); -} - -inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_sub>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), dst->src[1]->ne[0], dst->src[1]->ne[1], dst->src[1]->ne[2], nb1, nb2, offset, main_stream); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } - void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqrt); + ggml_sycl_op_sqrt(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sin); + ggml_sycl_op_sin(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_cos); + ggml_sycl_op_cos(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_acc); + ggml_sycl_op_acc(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu); + ggml_sycl_op_gelu(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_silu); + ggml_sycl_op_silu(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_gelu_quick); + ggml_sycl_op_gelu_quick(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_tanh); + ggml_sycl_op_tanh(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_relu); + ggml_sycl_op_relu(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sigmoid); + ggml_sycl_op_sigmoid(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void
ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardsigmoid); + ggml_sycl_op_hardsigmoid(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_hardswish); + ggml_sycl_op_hardswish(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } - void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_exp); + ggml_sycl_op_exp(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_log); + ggml_sycl_op_log(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_neg); + ggml_sycl_op_neg(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_step); + ggml_sycl_op_step(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_leaky_relu); + ggml_sycl_op_leaky_relu(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sqr); + ggml_sycl_op_sqr(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_upscale); + ggml_sycl_op_upscale(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pad); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - - - -void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_add); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sub); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_mul); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - 
GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_div); + ggml_sycl_op_pad(ctx, dst); GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 46443264505cc..6c3c3eef8455a 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -3,28 +3,6 @@ #include "common.hpp" -static __dpct_inline__ float op_repeat(const float a, const float b) { - return b; - GGML_UNUSED(a); -} - -static __dpct_inline__ float op_add(const float a, const float b) { - return a + b; -} - -static __dpct_inline__ float op_sub(const float a, const float b) { - return a - b; -} - -static __dpct_inline__ float op_mul(const float a, const float b) { - return a * b; -} - -static __dpct_inline__ float op_div(const float a, const float b) { - return a / b; -} - - void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst); @@ -65,12 +43,4 @@ void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - #endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index 3f0f34ad603f5..a43099680a4be 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -16,8 +16,6 @@ #include #include -#include "ggml-sycl.h" - #if GGML_SYCL_DNNL #include "dnnl.hpp" diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp new file mode 100644 index 0000000000000..00fe37d8e1c51 --- /dev/null +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -0,0 +1,176 @@ +#include "getrows.hpp" +#include "dequantize.hpp" + +template +static void k_get_rows(const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> & item_ct1 /*, size_t s13*/) { + const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2)) * 2; + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + item_ct1.get_local_id(0)) / ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + item_ct1.get_local_id(0)) % ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10 * s10 + i11 * s11 + i12 * s12]; + + dst_t * dst_row = dst + i10 * s1 + i11 * s2 + i12 * s3; + const void * src0_row = (const char *) src0 + i01 * nb01 + i11 * nb02 + i12 * nb03; + + const int ib = i00 / qk; // block index + const int iqs = (i00 % qk) / qr; // quant index + const int iybs = i00 - i00 % qk; // dst block start index + const int y_offset = qr == 1 ? 
1 : qk / 2; + + // dequantize + dfloat2 v; + dequantize_kernel(src0_row, ib, iqs, v); + + dst_row[iybs + iqs + 0] = v.x(); + dst_row[iybs + iqs + y_offset] = v.y(); +} + +template +static void k_get_rows_float(const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> & item_ct1 /*, size_t s13*/) { + const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2); + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + item_ct1.get_local_id(0)) / ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + item_ct1.get_local_id(0)) % ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10 * s10 + i11 * s11 + i12 * s12]; + + dst_t * dst_row = dst + i10 * s1 + i11 * s2 + i12 * s3; + const src0_t * src0_row = (const src0_t *) ((const char *) src0 + i01 * nb01 + i11 * nb02 + i12 * nb03); + + dst_row[i00] = src0_row[i00]; +} + +template +static void get_rows_sycl(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + 2 * SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2 * SYCL_GET_ROWS_BLOCK_SIZE); + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(dst->src[1]); + const size_t s11 = nb11 / ggml_element_size(dst->src[1]); + const size_t s12 = nb12 / ggml_element_size(dst->src[1]); + //const size_t s13 = nb13 / ggml_element_size(dst->src[1]); + + GGML_ASSERT(ne00 % 2 == 0); + const void * src0_dd = dst->src[0]->data; + const int32_t * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + + dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + k_get_rows(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, + item_ct1); + }); +} + +template static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE; + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(dst->src[1]); + const size_t s11 = nb11 / ggml_element_size(dst->src[1]); + const size_t s12 = nb12 / ggml_element_size(dst->src[1]); + //const size_t s13 = nb13 / ggml_element_size(dst->src[1]); + const src0_t * src0_dd = static_cast(dst->src[0]->data); + const 
int32_t * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + + dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, + item_ct1); + }); + } +} + +static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); + GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + + switch (dst->src[0]->type) { + case GGML_TYPE_F16: + get_rows_sycl_float(ctx, dst); + break; + case GGML_TYPE_F32: + get_rows_sycl_float(ctx, dst); + break; + case GGML_TYPE_Q4_0: + get_rows_sycl(ctx, dst); + break; + case GGML_TYPE_Q4_1: + get_rows_sycl(ctx, dst); + break; + case GGML_TYPE_Q5_0: + get_rows_sycl(ctx, dst); + break; + case GGML_TYPE_Q5_1: + get_rows_sycl(ctx, dst); + break; + case GGML_TYPE_Q8_0: + get_rows_sycl(ctx, dst); + break; + default: + // TODO: k-quants + GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(dst->src[0]->type)); + GGML_ABORT("fatal error"); + break; + } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_get_rows(ctx, dst); + GGML_SYCL_DEBUG("call %s\n", __func__); +} diff --git a/ggml/src/ggml-sycl/getrows.hpp b/ggml/src/ggml-sycl/getrows.hpp new file mode 100644 index 0000000000000..7dc21b99242b1 --- /dev/null +++ b/ggml/src/ggml-sycl/getrows.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_GETROWS_HPP +#define GGML_SYCL_GETROWS_HPP + +#include "common.hpp" + +void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_GETROWS_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 2984ed82e8a7c..ca92e966c3fb4 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -29,18 +29,14 @@ #include #include -#include -#include - #include "ggml-sycl.h" -#include "ggml-impl.h" -#include "ggml-backend-impl.h" - +#include "common.hpp" #include "ggml-sycl/backend.hpp" -#include "ggml-sycl/presets.hpp" #include "ggml-sycl/gemm.hpp" +#include "ggml.h" static bool g_sycl_loaded = false; +int g_ggml_sycl_debug = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -103,11 +99,10 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); auto global_mem_size = prop.get_global_mem_size()/1000000; - std::string xmx = gpu_has_xmx(device) ? 
"yes" : "no"; - GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|%14s|\n", id, device_type.c_str(), + GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), name.c_str(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), - global_mem_size, device.get_info().c_str(), xmx.c_str()); + global_mem_size, device.get_info().c_str()); } void ggml_backend_sycl_print_sycl_devices() { @@ -118,16 +113,16 @@ void ggml_backend_sycl_print_sycl_devices() { GGML_LOG_INFO( "| | | | " - " |Max | |Max |Global | | XMX |\n"); + " |Max | |Max |Global | |\n"); GGML_LOG_INFO( "| | | | " - " |compute|Max work|sub |mem | | or |\n"); + " |compute|Max work|sub |mem | |\n"); GGML_LOG_INFO( "|ID| Device Type| " - "Name|Version|units |group |group|size | Driver version| Tensor Cores |\n"); + "Name|Version|units |group |group|size | Driver version|\n"); GGML_LOG_INFO( "|--|-------------------|---------------------------------------|------" - "-|-------|--------|-----|-------|---------------------|--------------|\n"); + "-|-------|--------|-----|-------|---------------------|\n"); for (int id = 0; id < device_count; ++id) { sycl::device device = dpct::dev_mgr::instance().get_device(id); @@ -158,8 +153,8 @@ static void ggml_check_sycl() try { static bool initialized = false; if (!initialized) { - GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); #if defined(GGML_SYCL_FORCE_MMQ) GGML_LOG_INFO("GGML_SYCL_FORCE_MMQ: yes\n"); @@ -314,6 +309,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -338,6 +334,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -366,6 +363,7 @@ static bool ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *src, ggml_tensor *dst) try { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); if (ggml_backend_buffer_is_sycl(src->buffer)) { ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context; @@ -423,7 +421,8 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) try { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); + ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; ggml_sycl_set_device(ctx->device); queue_ptr stream = ctx->stream; @@ -470,6 +469,7 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); 
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; ggml_sycl_set_device(buft_ctx->device); const queue_ptr stream = buft_ctx->stream; @@ -713,6 +713,7 @@ static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buff static void ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; @@ -796,6 +797,7 @@ static void ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -849,6 +851,7 @@ static void ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); // split tensors must always be set in their entirety at once GGML_ASSERT(offset == 0); GGML_ASSERT(size == ggml_nbytes(tensor)); @@ -899,6 +902,7 @@ catch (sycl::exception const &exc) { } static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); GGML_UNUSED(buffer); GGML_UNUSED(value); } @@ -916,17 +920,6 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { }; // sycl split buffer type - -static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return GGML_SYCL_NAME "_Split"; - - GGML_UNUSED(buft); -} - -static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { - return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name; -} - static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point // instead, we allocate them for each tensor separately in init_tensor @@ -1033,10 +1026,12 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ } static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); ggml_sycl_host_free(buffer->context); } static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_SYCL_DEBUG("[SYCL] call %s", __func__); void * ptr = ggml_sycl_host_malloc(size); if (ptr == nullptr) { @@ -1264,8 +1259,6 @@ std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(q // struct ggml_sycl_pool_vmm : public ggml_sycl_pool /// kernels - -typedef void (*cpy_kernel_t)(const char * cx, char * cdst); typedef void (*ggml_sycl_op_mul_mat_t)( ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, @@ -1337,83 +1330,6 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, reinterpret_cast(y[ib].ds.y()) = sum; } -template -static void k_get_rows( - const void * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - 
/*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12, - const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { - - const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + - item_ct1.get_local_id(2)) * - 2; - const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) / - ne12; - const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) % - ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; - - const int ib = i00/qk; // block index - const int iqs = (i00%qk)/qr; // quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(src0_row, ib, iqs, v); - - dst_row[iybs + iqs + 0] = v.x(); - dst_row[iybs + iqs + y_offset] = v.y(); -} - -template -static void k_get_rows_float( - const src0_t * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12, - const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { - - const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + - item_ct1.get_local_id(2); - const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) / - ne12; - const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) % - ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); - - dst_row[i00] = src0_row[i00]; -} - static void mul_mat_p021_f16_f32( const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, @@ -1524,193 +1440,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous } } -static void cpy_1_f32_f32(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_f32_f16(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - sycl::half *dsti = (sycl::half *)cdsti; - - *dsti = sycl::vec(*xi) - .convert()[0]; -} - -static void cpy_1_f16_f16(const char * cxi, char * cdsti) { - const sycl::half *xi = (const sycl::half *)cxi; - sycl::half *dsti = (sycl::half *)cdsti; - - *dsti = *xi; -} - -static void cpy_1_f16_f32(const char * cxi, char * cdsti) { - const sycl::half *xi = (const sycl::half *)cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_i16_i16(const char * cxi, char * cdsti) { - const int16_t *xi = (const int16_t *)cxi; - int16_t *dsti = (int16_t *)cdsti; - - *dsti = *xi; -} - 
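The cpy_1_* helpers being removed here copy one element each; the templated cpy_f32_f16/cpy_f32_q kernels just below them recover 4-D coordinates from a flat element index and recombine those with byte strides. A minimal host-side sketch of that decomposition (illustrative only; src_offset and the sample strides are assumptions, not part of this patch):

#include <cstddef>
#include <cstdio>

// Unflatten element index i against the source extents (ne00..ne02), then
// recombine the coordinates with byte strides (nb00..nb03) to get the offset
// of the element's first byte -- the same arithmetic the kernels perform.
static size_t src_offset(int i, int ne00, int ne01, int ne02,
                         int nb00, int nb01, int nb02, int nb03) {
    const int i03 = i / (ne00 * ne01 * ne02);
    const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
    const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
    const int i00 =  i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;
    return (size_t) i00 * nb00 + (size_t) i01 * nb01 + (size_t) i02 * nb02 + (size_t) i03 * nb03;
}

int main() {
    // contiguous 4x3x2x1 f32 tensor: byte strides 4, 16, 48, 96
    std::printf("%zu\n", src_offset(13, 4, 3, 2, 4, 16, 48, 96)); // prints 52 == 13 * sizeof(float)
}

For non-contiguous tensors the same code applies unchanged, which is why the kernels carry the full stride set instead of assuming nb00 == element size.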
-static void cpy_1_i32_i32(const char * cxi, char * cdsti) { - const int32_t *xi = (const int32_t *)cxi; - int32_t *dsti = (int32_t *)cdsti; - - *dsti = *xi; -} - -template -static void cpy_f32_f16(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= ne) { - return; - } - - // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor - // then combine those indices with the corresponding byte offsets to get the total offsets - const int i03 = i/(ne00 * ne01 * ne02); - const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - - const int i13 = i/(ne10 * ne11 * ne12); - const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; - - cpy_1(cx + x_offset, cdst + dst_offset); -} - -static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q8_0 * dsti = (block_q8_0 *) cdsti; - - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = xi[j]; - amax = sycl::fmax(amax, sycl::fabs((float)v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = xi[j]*id; - - dsti->qs[j] = sycl::round((float)x0); - } -} - -static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_0 * dsti = (block_q4_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_0; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float)v)) { - amax = sycl::fabs((float)v); - vmax = v; - } - } - - const float d = vmax / -8; - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK4_0/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK4_0/2 + j]*id; - - const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_1 * dsti = (block_q4_1 *) cdsti; - - float vmin = FLT_MAX; - float vmax = -FLT_MAX; - - for (int j = 0; j < QK4_1; ++j) { - const float v = xi[j]; - - if (v < vmin) vmin = v; - if (v > vmax) vmax = v; - } - - const float d = (vmax - vmin) / ((1 << 4) - 1); - const float id = d ? 
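/* q4_1 is affine: d = (vmax - vmin)/15 is the 4-bit step and id its reciprocal, guarded so a constant block (d == 0) quantizes to zeros; e.g. vmin = -1, vmax = 2 gives d = 0.2, and x = 0.5 maps to (0.5 + 1) * 5 + 0.5, stored as quant 8 */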
1.0f/d : 0.0f; - - dsti->dm.x() = d; - dsti->dm.y() = vmin; - - for (int j = 0; j < QK4_1/2; ++j) { - const float x0 = (xi[0 + j] - vmin)*id; - const float x1 = (xi[QK4_1/2 + j] - vmin)*id; - - const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -template <cpy_kernel_t cpy_blck, int qk> -static void cpy_f32_q(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, const sycl::nd_item<3> &item_ct1) { - const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2)) * - qk; - - if (i >= ne) { - return; - } - - const int i03 = i/(ne00 * ne01 * ne02); - const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - - const int i13 = i/(ne10 * ne11 * ne12); - const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; - - cpy_blck(cx + x_offset, cdst + dst_offset); -} - static void k_sum_rows_f32(const float * x, float * dst, const int ncols, const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(1); @@ -1729,247 +1458,6 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, } - -template <typename T> -static inline void ggml_sycl_swap(T & a, T & b) { - T tmp = a; - a = b; - b = tmp; -} - -template <ggml_sort_order order> -__dpct_inline__ static void -k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad, - const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) { - // bitonic sort - int col = item_ct1.get_local_id(2); - int row = item_ct1.get_group(1); - - if (col >= ncols_pad) { - return; - } - - const float * x_row = x + row * ncols; - auto dst_row = (int *)dpct_local; - - // initialize indices - dst_row[col] = col; - - item_ct1.barrier(sycl::access::fence_space::local_space); - - for (int k = 2; k <= ncols_pad; k *= 2) { - for (int j = k / 2; j > 0; j /= 2) { - int ixj = col ^ j; - if (ixj > col) { - if ((col & k) == 0) { - if (dst_row[col] >= ncols || - (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? - x_row[dst_row[col]] > x_row[dst_row[ixj]] : - x_row[dst_row[col]] < x_row[dst_row[ixj]])) - ) { - ggml_sycl_swap(dst_row[col], dst_row[ixj]); - } - } else { - if (dst_row[ixj] >= ncols || - (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? - x_row[dst_row[col]] < x_row[dst_row[ixj]] : - x_row[dst_row[col]] > x_row[dst_row[ixj]])) - ) { - ggml_sycl_swap(dst_row[col], dst_row[ixj]); - } - } - } - /* - DPCT1118:1: SYCL group functions and algorithms must be encountered - in converged control flow. You may need to adjust the code.
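(In this launch the warning is benign: argsort_f32_i32_sycl further down uses a work-group of exactly ncols_pad items, so the early return for col >= ncols_pad never fires and every work-item reaches each barrier the same number of times.)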
- */ - item_ct1.barrier(sycl::access::fence_space::local_space); - } - } - - // copy the result to dst without the padding - if (col < ncols) { - dst[row * ncols + col] = dst_row[col]; - } -} - - -static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past, - const sycl::nd_item<3> &item_ct1) { - const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (col >= ncols) { - return; - } - - const int i = row*ncols + col; - //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; - //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU - dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; -} - -static void scale_f32(const float * x, float * dst, const float scale, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - dst[i] = scale * x[i]; -} - -static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); -} - -template -static void pool2d_nchw_kernel( - const int ih, const int iw, const int oh, const int ow, - const int kh, const int kw, const int sh, const int sw, - const int ph, const int pw, const int parallel_elements, - const Ti* src, To* dst, const enum ggml_op_pool op, - const sycl::nd_item<3> &item_ct1) { - int idx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (idx >= parallel_elements) { - return; - } - - const int I_HW = ih * iw; - const int O_HW = oh * ow; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / ow; - const int cur_ow = idx % O_HW % ow; - const Ti* i_ptr = src + nc * I_HW; - To* o_ptr = dst + nc * O_HW; - const int start_h = cur_oh * sh - ph; - const int bh = sycl::max(0, start_h); - const int eh = sycl::min(ih, start_h + kh); - const int start_w = cur_ow * sw - pw; - const int bw = sycl::max(0, start_w); - const int ew = sycl::min(iw, start_w + kw); - - To res = 0; - - switch (op) { - case GGML_OP_POOL_AVG: res = 0; break; - case GGML_OP_POOL_MAX: res = -FLT_MAX; break; - default: - res = (To) sycl::nan(uint32_t(0)); - break; - } - - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { -#if DPCT_COMPATIBILITY_TEMP >= 350 - /* - DPCT1098:106: The '*' expression is used instead of the __ldg - call. These two expressions do not provide the exact same - functionality. Check the generated code for potential precision - and/or performance issues. 
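(DPCT_COMPATIBILITY_TEMP mirrors __CUDA_ARCH__ from the CUDA source this was migrated from; on sm_35 and newer the original used an __ldg read-only load, and the plain dereference here is the closest SYCL-side equivalent.)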
- */ - Ti cur = *(i_ptr + i * iw + j); -#else - Ti cur = i_ptr[i * iw + j]; -#endif - switch (op) { - case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break; - case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break; - default: - res = (To) sycl::nan(uint32_t(0)); - break; - } - } - } - o_ptr[cur_oh * ow + cur_ow] = res; -} - -template -static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const void *src0_dd, - const int32_t *src1_dd, float *dst_dd, - queue_ptr stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); - const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE); - const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - GGML_ASSERT(ne00 % 2 == 0); - - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_get_rows( - src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); -} - -template -static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const src0_t *src0_dd, const int32_t *src1_dd, - float *dst_dd, queue_ptr stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); - const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE; - const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); - } - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); -} static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx, const int ky, const int kx_padded, @@ -2034,386 +1522,6 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( } } -static void -ggml_cpy_f16_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00, - const int ne01, const int ne02, const int nb00, - const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, - const int nb10, const int nb11, const int nb12, - const int nb13, queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - 
{sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, - nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK8_0 == 0); - const int num_blocks = ne / QK8_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK4_0 == 0); - const int num_blocks = ne / QK4_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, 
- const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - GGML_ASSERT(ne % QK4_1 == 0); - const int num_blocks = ne / QK4_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), - sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q( - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - // dpct::has_capability_or_fail(stream->get_device(), - // {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne, - const int ne00, const int ne01, - const int ne02, const int nb00, - const int nb01, const int nb02, - const int nb03, const int ne10, - const int ne11, const int ne12, - const int nb10, const int nb11, - const int nb12, const int nb13, - queue_ptr stream) { - - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - // dpct::has_capability_or_fail(stream->get_device(), - // {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); - } -} - -static void scale_f32_sycl(const float *x, float *dst, const float scale, - const int k, queue_ptr stream) { - const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - scale_f32(x, dst, 
scale, k, item_ct1); - }); -} - -static void clamp_f32_sycl(const float *x, float *dst, const float min, - const float max, const int k, - queue_ptr stream) { - const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - clamp_f32(x, dst, min, max, k, item_ct1); - }); -} - -static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, - const int nrows, queue_ptr stream) { - const sycl::range<3> block_dims(1, 1, WARP_SIZE); - const sycl::range<3> block_nums(1, nrows, 1); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[intel::reqd_sub_group_size(WARP_SIZE)]] { - k_sum_rows_f32(x, dst, ncols, item_ct1); - }); -} - -static int next_power_of_2(int x) { - int n = 1; - while (n < x) { - n *= 2; - } - return n; -} - -static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, - const int nrows, ggml_sort_order order, - queue_ptr stream) { - // bitonic sort requires ncols to be power of 2 - const int ncols_pad = next_power_of_2(ncols); - - const sycl::range<3> block_dims(1, 1, ncols_pad); - const sycl::range<3> block_nums(1, nrows, 1); - const size_t shared_mem = ncols_pad * sizeof(int); - - if (order == GGML_SORT_ORDER_ASC) { - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor dpct_local_acc_ct1( - sycl::range<1>(shared_mem), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32( - x, dst, ncols, ncols_pad, item_ct1, - dpct_local_acc_ct1.get_multi_ptr() - .get()); - }); - }); - } else if (order == GGML_SORT_ORDER_DESC) { - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor dpct_local_acc_ct1( - sycl::range<1>(shared_mem), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32( - x, dst, ncols, ncols_pad, item_ct1, - dpct_local_acc_ct1.get_multi_ptr() - .get()); - }); - }); - } else { - GGML_ABORT("fatal error"); - } -} - -static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols, - const int nrows, queue_ptr stream) { - const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE); - const sycl::range<3> block_nums(1, nrows, 1); - const size_t shared_mem = 256 * sizeof(float); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor shared_data( - sycl::range<1>(shared_mem/sizeof(float)), cgh); - sycl::local_accessor shared_indices( - sycl::range<1>(shared_mem/sizeof(float)), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - const int tid = item_ct1.get_local_id(2); - const int row = item_ct1.get_global_id(1); - - float max_val = -INFINITY; - int max_idx = -1; - - for (int col = tid; col < ncols; col += 256) { - float val = x[row * ncols + col]; - if (val > max_val) { - max_val = val; - max_idx = col; - } - } - - shared_data[tid] = max_val; - shared_indices[tid] = max_idx; - item_ct1.barrier(sycl::access::fence_space::local_space); - - for (int stride = 256/2; stride > 0; stride >>= 1) { - if (tid < stride) { - float val1 = shared_data[tid]; - float val2 = shared_data[tid + stride]; - if (val2 > val1) { - shared_data[tid] = val2; - shared_indices[tid] = shared_indices[tid + 
stride]; - } - } - item_ct1.barrier(sycl::access::fence_space::local_space); - } - - - if (tid == 0) { - dst[row] = shared_indices[0]; - } - }); - }); -} -static void diag_mask_inf_f32_sycl(const float *x, float *dst, - const int ncols_x, const int nrows_x, - const int rows_per_channel, const int n_past, - queue_ptr stream) { - const sycl::range<3> block_dims(1, SYCL_DIAG_MASK_INF_BLOCK_SIZE, 1); - const int block_num_x = (ncols_x + SYCL_DIAG_MASK_INF_BLOCK_SIZE - 1) / SYCL_DIAG_MASK_INF_BLOCK_SIZE; - const sycl::range<3> block_nums(1, block_num_x, nrows_x); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - diag_mask_inf_f32(x, dst, ncols_x, - rows_per_channel, n_past, - item_ct1); - }); -} - static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, const struct ggml_tensor *src, int64_t i3, int64_t i2, @@ -2494,64 +1602,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_d, const float *src1_d, - float *dst_d, const queue_ptr &stream) { - - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); - GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - - const int32_t * src1_i32 = (const int32_t *) src1_d; - - switch (src0->type) { - case GGML_TYPE_F16: - get_rows_sycl_float(ctx, src0, src1, dst, (const sycl::half *)src0_d, - src1_i32, dst_d, stream); - break; - case GGML_TYPE_F32: - get_rows_sycl_float(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q4_0: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q4_1: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q5_0: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q5_1: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q8_0: - get_rows_sycl(ctx, src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - default: - // TODO: k-quants - GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ABORT("fatal error"); - break; - } -} - - -static void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_d, const float *src1_d, - float *dst_d, - const queue_ptr &main_stream) { - - ggml_sycl_op_bin_bcast>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(src1_d); -} - inline void ggml_sycl_op_mul_mat_sycl( ggml_backend_sycl_context & ctx, @@ -2685,202 +1735,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - - 
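/* the seven int32 op_params read above are laid out as [op, k0, k1, s0, s1, p0, p1]; a minimal host-side sketch of producing them, assuming that layout (values illustrative only): int32_t params[7] = { GGML_OP_POOL_AVG, 3, 3, 1, 1, 1, 1 }; memcpy(dst->op_params, params, sizeof(params)); */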
const int64_t IH = src0->ne[1]; - const int64_t IW = src0->ne[0]; - - const int64_t N = dst->ne[3]; - const int64_t OC = dst->ne[2]; - const int64_t OH = dst->ne[1]; - const int64_t OW = dst->ne[0]; - - const int parallel_elements = N * OC * OH * OW; - const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; - sycl::range<3> block_nums(1, 1, num_blocks); - main_stream->parallel_for( - sycl::nd_range<3>(block_nums * - sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, - parallel_elements, src0_dd, dst_dd, op, - item_ct1); - }); - - GGML_UNUSED(src1); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); -} - -inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ne = ggml_nelements(src0); - - sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); -} - -inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); -} - -inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); - - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - - argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); -} - -inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const float *src0_dd, const float *src1_dd, - float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); - - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - argmax_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); -} - -inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int nrows0 = ggml_nrows(src0); - - const int 
n_past = ((int32_t *) dst->op_params)[0]; - - diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); -} - -inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - float scale; - memcpy(&scale, dst->op_params, sizeof(float)); - - scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); - /* - DPCT1010:87: SYCL uses exceptions to report errors and does not use the - error codes. The call was replaced with 0. You need to rewrite this code. - */ - SYCL_CHECK(0); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); -} - -inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, - const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - float min; - float max; - memcpy(&min, dst->op_params, sizeof(float)); - memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); - - clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); - /* - DPCT1010:88: SYCL uses exceptions to report errors and does not use the - error codes. The call was replaced with 0. You need to rewrite this code. - */ - SYCL_CHECK(0); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); - GGML_UNUSED(ctx); -} - static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { static bool peer_access_enabled = false; @@ -3246,37 +2100,6 @@ catch (sycl::exception const &exc) { std::exit(1); } - -static void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_repeat); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_get_rows); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_norm); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rms_norm); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - -static void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_SYCL_DEBUG("call %s\n", __func__); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_group_norm); - GGML_SYCL_DEBUG("call %s done\n", __func__); -} - static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) try { @@ -3557,25 +2380,41 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor if (src0->ne[3] == 1 && src1->ne[3] == 
1) { // KQ single-batch // mmv p021 was specific for these dimensions + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_vec_p021\n", __func__); ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst); + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_vec_p021 done\n", __func__); } else { // The kernel from the if path is faster for that specific case, but does not support all mul mats. + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_batched_sycl\n", __func__); ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_batched_sycl done\n", __func__); } } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_vec_nc\n", __func__); ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst); + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_vec_nc done\n", __func__); } else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_batched_sycl\n", __func__); ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); + GGML_SYCL_DEBUG("%s: call ggml_sycl_mul_mat_batched_sycl done\n", __func__); } else if (use_dequantize_mul_mat_vec) { + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_dequantize_mul_mat_vec\n", __func__); ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_dequantize_mul_mat_vec done\n", __func__); } else if (use_mul_mat_vec_q) { + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_vec_q\n", __func__); ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_vec_q done\n", __func__); } else if (use_mul_mat_q) { + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_q\n", __func__); ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true); + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_q done\n", __func__); } else { + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_sycl\n", __func__); ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); + GGML_SYCL_DEBUG("%s: call ggml_sycl_op_mul_mat_sycl done\n", __func__); } } @@ -3646,7 +2485,7 @@ __dpct_inline__ static void k_copy_dst_from_contiguous( } static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, - ggml_tensor *dst) try { + ggml_tensor * dst) try { const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer) && "mul_mat_id does not support split buffers"); @@ -3814,104 +2653,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_scale); -} - -static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_clamp); -} - -static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst) try { - const int64_t ne = ggml_nelements(src0); - GGML_ASSERT(ne == ggml_nelements(src1)); - - GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); - GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - - GGML_TENSOR_BINARY_OP_LOCALS01; - - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - queue_ptr main_stream = ctx.stream(); 
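/* the branch chain below is the entire CPY dispatch table; a compact alternative sketch, not what this patch does, keys one switch on both types via a hypothetical macro CPY_PAIR(a, b) defined as (((int)(a) << 8) | (int)(b)): switch (CPY_PAIR(src0->type, src1->type)) { case CPY_PAIR(GGML_TYPE_F32, GGML_TYPE_F16): ggml_cpy_f32_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); break; default: GGML_ABORT("fatal error"); } */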
- - char * src0_ddc = (char *) src0->data; - char * src1_ddc = (char *) src1->data; - - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f16_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { - ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { - ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else { - GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, - ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ABORT("fatal error"); - } - GGML_UNUSED(dst); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - // TODO: why do we pass dst as src1 here? 
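/* a likely answer to the TODO: GGML_OP_DUP and GGML_OP_CONT are routed through the CPY path, and the duplicate's destination is dst itself, so dst stands in for the copy's src1; this is inferred from the call that follows, not stated anywhere in the patch */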
- ggml_sycl_cpy(ctx, dst->src[0], dst, nullptr); -} - -static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf); -} - -static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope); -} - -static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_pool2d); -} - -static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_im2col); -} - -static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum); -} - -static void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_sum_rows); -} - -static void ggml_sycl_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argsort); -} - -static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); - ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_argmax); -} - - void ggml_sycl_set_main_device(const int main_device) try { if (dpct::get_current_device_id() == static_cast (main_device)) { return; @@ -3945,7 +2686,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_argmax(ctx, dst); break; case GGML_OP_CONV_TRANSPOSE_1D: - ggml_sycl_op_conv_transpose_1d(ctx, dst); + ggml_sycl_conv_transpose_1d(ctx, dst); break; case GGML_OP_REPEAT: ggml_sycl_repeat(ctx, dst); @@ -4021,7 +2762,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_group_norm(ctx, dst); break; case GGML_OP_CONCAT: - ggml_sycl_op_concat(ctx, dst); + ggml_sycl_concat(ctx, dst); break; case GGML_OP_UPSCALE: ggml_sycl_upscale(ctx, dst); @@ -4070,7 +2811,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_clamp(ctx, dst); break; case GGML_OP_CPY: - ggml_sycl_cpy(ctx, dst->src[0], dst->src[1], dst); + ggml_sycl_cpy(ctx, dst->src[0], dst->src[1]); break; case GGML_OP_CONT: ggml_sycl_dup(ctx, dst); @@ -4086,7 +2827,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_diag_mask_inf(ctx, dst); break; case GGML_OP_SOFT_MAX: - ggml_sycl_op_soft_max(ctx, dst); + ggml_sycl_softmax(ctx, dst); break; case GGML_OP_ROPE: ggml_sycl_rope(ctx, dst); @@ -4113,7 +2854,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens ggml_sycl_op_rwkv_wkv6(ctx, dst); break; case GGML_OP_GATED_LINEAR_ATTN: - ggml_sycl_op_gated_linear_attn(ctx, dst); + ggml_sycl_gated_linear_attn(ctx, dst); break; default: return false; @@ -4340,7 +3081,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) { } int ggml_backend_sycl_get_device_count() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n"); + // 
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count:\n"); return ggml_sycl_info().device_count; } @@ -4537,14 +3278,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - case GGML_OP_NORM: case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_LOG: case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: + return true; + case GGML_OP_NORM: + case GGML_OP_GROUP_NORM: case GGML_OP_RMS_NORM: + return ggml_is_contiguous(op->src[0]); case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SQRT: @@ -4576,7 +3320,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: - case GGML_OP_GROUP_NORM: case GGML_OP_UPSCALE: case GGML_OP_PAD: case GGML_OP_LEAKY_RELU: diff --git a/ggml/src/ggml-sycl/gla.cpp b/ggml/src/ggml-sycl/gla.cpp index eedb47486430a..dd01ec3f8ab07 100644 --- a/ggml/src/ggml-sycl/gla.cpp +++ b/ggml/src/ggml-sycl/gla.cpp @@ -75,7 +75,7 @@ static void gated_linear_attn_f32_kernel(const dpct::queue_ptr stream, u_int B, }); } -void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +static void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { const float * k_d = static_cast(dst->src[0]->data); const float * v_d = static_cast(dst->src[1]->data); const float * r_d = static_cast(dst->src[2]->data); @@ -88,6 +88,7 @@ void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor const int64_t H = dst->src[0]->ne[1]; dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); GGML_ASSERT(dst->src[4]->type == GGML_TYPE_F32); GGML_ASSERT(C % H == 0); GGML_ASSERT(C / H == 64 || C / H == 128); @@ -102,4 +103,13 @@ void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor } else { gated_linear_attn_f32_kernel<128>(stream, B, T, C, H, scale, k_d, v_d, r_d, td_d, s_d, dst_d); } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_gated_linear_attn(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/gla.hpp b/ggml/src/ggml-sycl/gla.hpp index 607cf3a7f3049..a7e3af79db190 100644 --- a/ggml/src/ggml-sycl/gla.hpp +++ b/ggml/src/ggml-sycl/gla.hpp @@ -3,6 +3,6 @@ #include "common.hpp" -void ggml_sycl_op_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_gated_linear_attn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_GLA_HPP diff --git a/ggml/src/ggml-sycl/im2col.cpp b/ggml/src/ggml-sycl/im2col.cpp index 6146a99edbe77..6712290703f97 100644 --- a/ggml/src/ggml-sycl/im2col.cpp +++ b/ggml/src/ggml-sycl/im2col.cpp @@ -82,13 +82,11 @@ static void im2col_sycl( } } -void ggml_sycl_op_im2col( - ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream) { +static void ggml_sycl_op_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F16); + 
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; @@ -100,27 +98,38 @@ void ggml_sycl_op_im2col( const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; - const int64_t IC = src1->ne[is_2D ? 2 : 1]; - const int64_t IH = is_2D ? src1->ne[1] : 1; - const int64_t IW = src1->ne[0]; + const int64_t IC = dst->src[1]->ne[is_2D ? 2 : 1]; + const int64_t IH = is_2D ? dst->src[1]->ne[1] : 1; + const int64_t IW = dst->src[1]->ne[0]; - const int64_t KH = is_2D ? src0->ne[1] : 1; - const int64_t KW = src0->ne[0]; + const int64_t KH = is_2D ? dst->src[0]->ne[1] : 1; + const int64_t KW = dst->src[0]->ne[0]; const int64_t OH = is_2D ? dst->ne[2] : 1; const int64_t OW = dst->ne[1]; - const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 - const int64_t batch = src1->ne[3]; - const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 + const size_t delta_offset = dst->src[1]->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 + const int64_t batch = dst->src[1]->ne[3]; + const size_t batch_offset = dst->src[1]->nb[3] / 4; // nb is byte offset, src is type float32 + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); if (dst->type == GGML_TYPE_F16) { - im2col_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + const float * src1_dd = static_cast(dst->src[1]->data); + sycl::half * dst_dd = static_cast(dst->data); + im2col_sycl(src1_dd, dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); } else { - im2col_sycl(src1_dd, (float *)dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); + const float * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + im2col_sycl(src1_dd, dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} - GGML_UNUSED(src0); - GGML_UNUSED(src0_dd); - GGML_UNUSED(ctx); +void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_im2col(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/im2col.hpp b/ggml/src/ggml-sycl/im2col.hpp index 7db144fbbe524..9d51738f2c72a 100644 --- a/ggml/src/ggml-sycl/im2col.hpp +++ b/ggml/src/ggml-sycl/im2col.hpp @@ -15,9 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_im2col( - ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, - const queue_ptr &main_stream); +void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_IM2COL_HPP diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 9cf2be15575d8..1cf6e01f51abe 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -311,68 +311,87 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, } } -void 
ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst, const float* src0_dd, - const float* src1_dd, float* dst_dd, - const queue_ptr& main_stream) { +static void ggml_sycl_op_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - const int64_t ne00 = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); float eps; memcpy(&eps, dst->op_params, sizeof(float)); - norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - (void)src1; - (void)dst; - (void)src1_dd; + norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst, - const float* src0_dd, const float* src1_dd, - float* dst_dd, - const queue_ptr& main_stream) { +static void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); int num_groups = dst->op_params[0]; float eps; memcpy(&eps, dst->op_params + 1, sizeof(float)); - int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream, ctx.device); - - (void)src1; - (void)dst; - (void)src1_dd; - GGML_UNUSED(ctx); + int group_size = dst->src[0]->ne[0] * dst->src[0]->ne[1] * ((dst->src[0]->ne[2] + num_groups - 1) / num_groups); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + group_norm_f32_sycl(src0_dd, dst_dd, num_groups, eps, group_size, dst->src[0]->ne[0] * dst->src[0]->ne[1] * dst->src[0]->ne[2], main_stream, ctx.device); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); } -void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst, - const float* src0_dd, const float* src1_dd, - float* dst_dd, - const queue_ptr& main_stream) { +static void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try { - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); - const int64_t ne00 = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t nrows = ggml_nrows(dst->src[0]); float eps; memcpy(&eps, 
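/* ggml op_params is raw int32 storage, so float parameters such as eps are recovered with memcpy rather than a pointer cast, avoiding type-punning and strict-aliasing issues */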
dst->op_params, sizeof(float)); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream, ctx.device); +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_norm(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_rms_norm(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} - (void)src1; - (void)dst; - (void)src1_dd; +void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_group_norm(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); } diff --git a/ggml/src/ggml-sycl/norm.hpp b/ggml/src/ggml-sycl/norm.hpp index a9ad9156fa33e..a227a9bc0f3f7 100644 --- a/ggml/src/ggml-sycl/norm.hpp +++ b/ggml/src/ggml-sycl/norm.hpp @@ -15,21 +15,8 @@ #include "common.hpp" -void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, const ggml_tensor* src1, - ggml_tensor* dst, const float* src0_dd, - const float* src1_dd, float* dst_dd, - const queue_ptr& main_stream); - -void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst, - const float* src0_dd, const float* src1_dd, - float* dst_dd, - const queue_ptr& main_stream); - -void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, const ggml_tensor* src0, - const ggml_tensor* src1, ggml_tensor* dst, - const float* src0_dd, const float* src1_dd, - float* dst_dd, - const queue_ptr& main_stream); +void ggml_sycl_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_group_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_NORM_HPP diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index 8e8347ff4f95e..d21196a214db8 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -17,6 +17,8 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Get SYCL queue dpct::queue_ptr stream = ctx.stream(); + // set device + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); // Dimension checks GGML_ASSERT(ne01 == ne11); // Inner dimensions must match @@ -37,6 +39,7 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans; const int64_t ldb = (src1_T ? 
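/* ldb is the leading dimension oneMKL expects, in elements: nb10 when the gemm consumes src1 non-transposed (src1_T set), nb11 otherwise; both are byte strides, hence the division by sizeof(float) that follows */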
diff --git a/ggml/src/ggml-sycl/pool2d.cpp b/ggml/src/ggml-sycl/pool2d.cpp
new file mode 100644
index 0000000000000..b010e7d3a11e5
--- /dev/null
+++ b/ggml/src/ggml-sycl/pool2d.cpp
@@ -0,0 +1,115 @@
+#include "pool2d.hpp"
+#include <cfloat>
+
+template <typename Ti, typename To>
+static void pool2d_nchw_kernel(const int ih, const int iw, const int oh, const int ow, const int kh, const int kw,
+                               const int sh, const int sw, const int ph, const int pw, const int parallel_elements,
+                               const Ti * src, To * dst, const enum ggml_op_pool op,
+                               const sycl::nd_item<3> & item_ct1) {
+    int idx = item_ct1.get_local_id(2) + item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (idx >= parallel_elements) {
+        return;
+    }
+
+    const int I_HW = ih * iw;
+    const int O_HW = oh * ow;
+    const int nc = idx / O_HW;
+    const int cur_oh = idx % O_HW / ow;
+    const int cur_ow = idx % O_HW % ow;
+    const Ti * i_ptr = src + nc * I_HW;
+    To * o_ptr = dst + nc * O_HW;
+    const int start_h = cur_oh * sh - ph;
+    const int bh = sycl::max(0, start_h);
+    const int eh = sycl::min(ih, start_h + kh);
+    const int start_w = cur_ow * sw - pw;
+    const int bw = sycl::max(0, start_w);
+    const int ew = sycl::min(iw, start_w + kw);
+
+    To res = 0;
+
+    switch (op) {
+        case GGML_OP_POOL_AVG:
+            res = 0;
+            break;
+        case GGML_OP_POOL_MAX:
+            res = -FLT_MAX;
+            break;
+        default:
+            res = (To) sycl::nan(uint32_t(0));
+            break;
+    }
+
+    for (int i = bh; i < eh; i += 1) {
+        for (int j = bw; j < ew; j += 1) {
+#if DPCT_COMPATIBILITY_TEMP >= 350
+            /*
+            DPCT1098:106: The '*' expression is used instead of the __ldg
+            call. These two expressions do not provide the exact same
+            functionality. Check the generated code for potential precision
+            and/or performance issues.
+            */
+            Ti cur = *(i_ptr + i * iw + j);
+#else
+            Ti cur = i_ptr[i * iw + j];
+#endif
+            switch (op) {
+                case GGML_OP_POOL_AVG:
+                    res += (cur / (kh * kw));
+                    break;
+                case GGML_OP_POOL_MAX:
+                    res = sycl::max(res, (To) cur);
+                    break;
+                default:
+                    res = (To) sycl::nan(uint32_t(0));
+                    break;
+            }
+        }
+    }
+    o_ptr[cur_oh * ow + cur_ow] = res;
+}
+
+static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+
+    const int32_t * opts = (const int32_t *) dst->op_params;
+    enum ggml_op_pool op = static_cast<enum ggml_op_pool>(opts[0]);
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    const int64_t IH = dst->src[0]->ne[1];
+    const int64_t IW = dst->src[0]->ne[0];
+
+    const int64_t N = dst->ne[3];
+    const int64_t OC = dst->ne[2];
+    const int64_t OH = dst->ne[1];
+    const int64_t OW = dst->ne[0];
+
+    const int parallel_elements = N * OC * OH * OW;
+    const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE;
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float * dst_dd = static_cast<float *>(dst->data);
+    sycl::range<3> block_nums(1, 1, num_blocks);
+    main_stream->parallel_for(sycl::nd_range<3>(block_nums * sycl::range<3>(1, 1, SYCL_POOL2D_BLOCK_SIZE),
+                                                sycl::range<3>(1, 1, SYCL_POOL2D_BLOCK_SIZE)),
+                              [=](sycl::nd_item<3> item_ct1) {
+                                  pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd,
+                                                     dst_dd, op, item_ct1);
+                              });
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_pool2d(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
}
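Illustrative aside: in the GGML_OP_POOL_AVG branch above, every element is divided by the full kernel area kh * kw even when the window is clipped at a border, i.e. count-include-pad averaging. A scalar sketch of the same rule; pool2d_avg_ref is hypothetical and mirrors the kernel's window clamping:

#include <algorithm>
#include <cstdio>

static float pool2d_avg_ref(const float * src, int ih, int iw, int oh_idx, int ow_idx,
                            int kh, int kw, int sh, int sw, int ph, int pw) {
    const int start_h = oh_idx * sh - ph;
    const int start_w = ow_idx * sw - pw;
    float res = 0.0f;
    for (int i = std::max(0, start_h); i < std::min(ih, start_h + kh); ++i) {
        for (int j = std::max(0, start_w); j < std::min(iw, start_w + kw); ++j) {
            res += src[i * iw + j] / (kh * kw); // divisor is the full window, as in the kernel
        }
    }
    return res;
}

int main() {
    // 2x2 input, 2x2 kernel, stride 1, padding 1: the top-left window overlaps
    // a single input element, yet is still divided by the full area of 4.
    const float src[4] = { 4.0f, 0.0f, 0.0f, 0.0f };
    std::printf("%g\n", pool2d_avg_ref(src, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1)); // 1
    return 0;
}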
diff --git a/ggml/src/ggml-sycl/pool2d.hpp b/ggml/src/ggml-sycl/pool2d.hpp
new file mode 100644
index 0000000000000..6b2ce8043c951
--- /dev/null
+++ b/ggml/src/ggml-sycl/pool2d.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_POOL2D_HPP
+#define GGML_SYCL_POOL2D_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_POOL2D_HPP
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
index 1244b231af738..78fe1d0012078 100644
--- a/ggml/src/ggml-sycl/rope.cpp
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -192,18 +192,17 @@ static void rope_neox_sycl(
     }
 }
 
-void ggml_sycl_op_rope(
-    ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream) {
-    const ggml_tensor * src2 = dst->src[2];
+static void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(dst->src[0]->type == dst->type);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->src[1]->buffer));
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
 
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t nr = ggml_nrows(src0);
+    const int64_t ne00 = dst->src[0]->ne[0];
+    const int64_t ne01 = dst->src[0]->ne[1];
+    const int64_t nr = ggml_nrows(dst->src[0]);
 
     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -228,49 +227,61 @@ void ggml_sycl_op_rope(
 
     const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
-    const int32_t * pos = (const int32_t *) src1_dd;
-
+    const int32_t * pos = static_cast<const int32_t *>(dst->src[1]->data);
     const float * freq_factors = nullptr;
-    if (src2 != nullptr) {
-        freq_factors = (const float *) src2->data;
+    if (dst->src[2] != nullptr) {
+        freq_factors = static_cast<const float *>(dst->src[2]->data);
     }
 
     rope_corr_dims corr_dims;
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
 
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     // compute
     if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_sycl(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, freq_factors, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_sycl(
-                (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, freq_factors, main_stream
-            );
+        if (dst->src[0]->type == GGML_TYPE_F32) {
+            const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+            float * dst_dd = static_cast<float *>(dst->data);
+
+            rope_neox_sycl(src0_dd, dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01,
+                           freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream);
+        } else if (dst->src[0]->type == GGML_TYPE_F16) {
+            const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+            sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
+            rope_neox_sycl(src0_dd, dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01,
+                           freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream);
         } else {
             GGML_ABORT("fatal error");
         }
     } else {
-        if (src0->type == GGML_TYPE_F32) {
+        if (dst->src[0]->type == GGML_TYPE_F32) {
+            const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+            float * dst_dd = static_cast<float *>(dst->data);
             rope_norm_sycl(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                src0_dd, dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, freq_factors, main_stream
             );
-        } else if (src0->type == GGML_TYPE_F16) {
+        } else if (dst->src[0]->type == GGML_TYPE_F16) {
+            const sycl::half * src0_dd = static_cast<const sycl::half *>(dst->src[0]->data);
+            sycl::half * dst_dd = static_cast<sycl::half *>(dst->data);
             rope_norm_sycl(
-                (const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                src0_dd, dst_dd, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, freq_factors, main_stream
            );
         } else {
             GGML_ABORT("fatal error");
         }
     }
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
 
-    GGML_UNUSED(src1);
-    GGML_UNUSED(dst);
-    GGML_UNUSED(src1_dd);
-    GGML_UNUSED(ctx);
+void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_rope(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
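Illustrative aside: rope_norm_sycl and rope_neox_sycl differ only in how they pair the rotated components, adjacent elements (x[2i], x[2i+1]) for "norm" versus split halves (x[i], x[i + n_dims/2]) for "neox". A simplified scalar model of the "norm" variant, omitting YaRN correction, freq_factors, and attn_factor scaling; rope_norm_ref is hypothetical:

#include <cmath>
#include <cstdio>

static void rope_norm_ref(const float * x, float * dst, int n_dims, int pos, float freq_base) {
    for (int i = 0; i < n_dims; i += 2) {
        // pair i/2 rotates by theta = pos * freq_base^(-i / n_dims)
        const float theta = pos * std::pow(freq_base, -(float) i / n_dims);
        const float c = std::cos(theta);
        const float s = std::sin(theta);
        dst[i]     = x[i] * c - x[i + 1] * s;
        dst[i + 1] = x[i] * s + x[i + 1] * c;
    }
}

int main() {
    const float x[4] = { 1.0f, 0.0f, 1.0f, 0.0f };
    float out[4];
    rope_norm_ref(x, out, 4, 1, 10000.0f); // pair 0 rotates by 1 rad, pair 1 by 0.01 rad
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}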
diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp
index 00354c3131bd7..b2824c510ee37 100644
--- a/ggml/src/ggml-sycl/rope.hpp
+++ b/ggml/src/ggml-sycl/rope.hpp
@@ -15,8 +15,6 @@
 
 #include "common.hpp"
 
-void ggml_sycl_op_rope(
-    ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const float *src0_dd, const float *src1_dd, float *dst_dd, const queue_ptr &main_stream);
+void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 #endif // GGML_SYCL_ROPE_HPP
diff --git a/ggml/src/ggml-sycl/scale.cpp b/ggml/src/ggml-sycl/scale.cpp
new file mode 100644
index 0000000000000..74c1178367ed3
--- /dev/null
+++ b/ggml/src/ggml-sycl/scale.cpp
@@ -0,0 +1,49 @@
+#include "scale.hpp"
+
+static void scale_f32(const float * x, float * dst, const float scale, const int k, const sycl::nd_item<3> & item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
+static void scale_f32_sycl(const float * x, float * dst, const float scale, const int k, queue_ptr stream) {
+    const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) { scale_f32(x, dst, scale, k, item_ct1); });
+}
+
+inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float * dst_dd = static_cast<float *>(dst->data);
+
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+
+    scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(dst->src[0]), main_stream);
+    /*
+    DPCT1010:87: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    SYCL_CHECK(0);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_scale(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
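Illustrative aside: scale_f32_sycl uses the ceil-division launch arithmetic that recurs throughout these files, with the i >= k guard in scale_f32 masking the overshoot in the last block. Isolated below; ceil_div and the block-size value are stand-ins, the real SYCL_SCALE_BLOCK_SIZE lives in common.hpp:

#include <cassert>

static int ceil_div(int a, int b) {
    return (a + b - 1) / b;
}

int main() {
    const int block_size = 256;              // assumed value of SYCL_SCALE_BLOCK_SIZE
    assert(ceil_div(1000, block_size) == 4); // 4 blocks cover 1024 work-items;
                                             // the i >= k guard masks the 24 excess
    return 0;
}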
diff --git a/ggml/src/ggml-sycl/scale.hpp b/ggml/src/ggml-sycl/scale.hpp
new file mode 100644
index 0000000000000..d079a1e5b7005
--- /dev/null
+++ b/ggml/src/ggml-sycl/scale.hpp
@@ -0,0 +1,8 @@
+#ifndef GGML_SYCL_SCALE_HPP
+#define GGML_SYCL_SCALE_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_SCALE_HPP
diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp
index 563e0655f5527..b2b73dc68d54a 100644
--- a/ggml/src/ggml-sycl/softmax.cpp
+++ b/ggml/src/ggml-sycl/softmax.cpp
@@ -224,10 +224,11 @@ static void soft_max_f32_sycl(const float * x, const T * mask,
     }
 }
 
-void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
 
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
 
     GGML_ASSERT(!dst->src[1] || dst->src[1]->type == GGML_TYPE_F16 || dst->src[1]->type == GGML_TYPE_F32); // src1 contains mask and it is optional
 
@@ -244,18 +245,31 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
     float * dst_dd = static_cast<float *>(dst->data);
 
-    ggml_sycl_set_device(ctx.device);
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     dpct::queue_ptr main_stream = ctx.stream();
 
     if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
         const sycl::half * src1_dd = static_cast<const sycl::half *>(dst->src[1]->data);
+        GGML_SYCL_DEBUG("%s: Mask precision: F16\n", __func__);
         soft_max_f32_sycl(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
     } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
         const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
+        GGML_SYCL_DEBUG("%s: Mask precision: F32\n", __func__);
         soft_max_f32_sycl(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
     } else {
         /* mask unavailable */
-        soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+        GGML_SYCL_DEBUG("%s: No mask supplied\n", __func__);
+        soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream,
+                                 ctx.device);
     }
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_softmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_soft_max(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }
diff --git a/ggml/src/ggml-sycl/softmax.hpp b/ggml/src/ggml-sycl/softmax.hpp
index 2cf8582ec92e9..0c12a530dfe1a 100644
--- a/ggml/src/ggml-sycl/softmax.hpp
+++ b/ggml/src/ggml-sycl/softmax.hpp
@@ -15,6 +15,6 @@
 
 #include "common.hpp"
 
-void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, ggml_tensor *dst);
+void ggml_sycl_softmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 #endif // GGML_SYCL_SOFTMAX_HPP
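Illustrative aside: whichever mask branch runs above, the per-row arithmetic is the same: scale the logits, add the optional mask, then take a numerically stable softmax. A scalar reference that omits the max_bias/ALiBi slope handling; soft_max_row_ref is hypothetical:

#include <algorithm>
#include <cmath>
#include <cstdio>

static void soft_max_row_ref(const float * x, const float * mask, float * dst, int n, float scale) {
    float maxv = -INFINITY;
    for (int i = 0; i < n; ++i) {
        dst[i] = x[i] * scale + (mask ? mask[i] : 0.0f); // nullptr mask, as in the else branch
        maxv   = std::max(maxv, dst[i]);
    }
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        dst[i] = std::exp(dst[i] - maxv); // subtract the row max for numerical stability
        sum += dst[i];
    }
    for (int i = 0; i < n; ++i) {
        dst[i] /= sum;
    }
}

int main() {
    const float x[3] = { 1.0f, 2.0f, 3.0f };
    float p[3];
    soft_max_row_ref(x, nullptr, p, 3, 1.0f);
    std::printf("%g %g %g\n", p[0], p[1], p[2]); // the three values sum to 1
    return 0;
}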
diff --git a/ggml/src/ggml-sycl/sum.cpp b/ggml/src/ggml-sycl/sum.cpp
new file mode 100644
index 0000000000000..acec1493a4f63
--- /dev/null
+++ b/ggml/src/ggml-sycl/sum.cpp
@@ -0,0 +1,74 @@
+#include "sum.hpp"
+
+static void k_sum_rows_f32(const float * x, float * dst, const int ncols, const sycl::nd_item<3> & item_ct1) {
+    const int row = item_ct1.get_group(1);
+    const int col = item_ct1.get_local_id(2);
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum, item_ct1);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+static void sum_rows_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1)
+                             [[intel::reqd_sub_group_size(WARP_SIZE)]] { k_sum_rows_f32(x, dst, ncols, item_ct1); });
+}
+
+inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+
+    const int64_t ne = ggml_nelements(dst->src[0]);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float * dst_dd = static_cast<float *>(dst->data);
+
+    sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer));
+
+    const int64_t ncols = dst->src[0]->ne[0];
+    const int64_t nrows = ggml_nrows(dst->src[0]);
+    dpct::queue_ptr main_stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float * dst_dd = static_cast<float *>(dst->data);
+
+    sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
+}
+
+void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_sum(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
+
+void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(dst->src[0]));
+    GGML_SYCL_DEBUG("call %s\n", __func__);
+    ggml_sycl_op_sum_rows(ctx, dst);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+}
diff --git a/ggml/src/ggml-sycl/sum.hpp b/ggml/src/ggml-sycl/sum.hpp
new file mode 100644
index 0000000000000..d1b8e5a7c468e
--- /dev/null
+++ b/ggml/src/ggml-sycl/sum.hpp
@@ -0,0 +1,9 @@
+#ifndef GGML_SYCL_SUM_HPP
+#define GGML_SYCL_SUM_HPP
+
+#include "common.hpp"
+
+void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+void ggml_sycl_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+
+#endif // GGML_SYCL_SUM_HPP
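Illustrative aside: ggml_sycl_op_sum reuses the row-sum kernel by treating the whole tensor as one row of ggml_nelements() columns, so a single work-group's sub-group reduction covers it. Below is a scalar model of the butterfly (shuffle-xor style) reduction that warp_reduce_sum in common.hpp presumably performs; that shape is an assumption, since its body is not shown in this diff:

#include <cstdio>
#include <vector>

// After log2(n) exchange rounds every lane holds the full sum, which is why
// k_sum_rows_f32 can read the result from a single lane (col == 0).
static float butterfly_sum(std::vector<float> v) {
    const int n = (int) v.size(); // must be a power of two, like WARP_SIZE
    for (int offset = n / 2; offset > 0; offset >>= 1) {
        std::vector<float> next(n);
        for (int lane = 0; lane < n; ++lane) {
            next[lane] = v[lane] + v[lane ^ offset]; // shuffle-xor partner exchange
        }
        v = next;
    }
    return v[0];
}

int main() {
    std::vector<float> lanes(32, 1.0f);        // 32 lanes each holding 1.0
    std::printf("%g\n", butterfly_sum(lanes)); // 32
    return 0;
}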
diff --git a/ggml/src/ggml-sycl/tsembd.cpp b/ggml/src/ggml-sycl/tsembd.cpp
index b877d18c1730a..a5aedd4d0e252 100644
--- a/ggml/src/ggml-sycl/tsembd.cpp
+++ b/ggml/src/ggml-sycl/tsembd.cpp
@@ -55,11 +55,10 @@ static void timestep_embedding_f32_sycl(
     });
 }
 
-void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tensor * dst) try {
     const ggml_tensor *src0 = dst->src[0];
-    const ggml_tensor *src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const float * src0_d = static_cast<const float *>(src0->data);
+    float * dst_d = static_cast<float *>(dst->data);
     dpct::queue_ptr stream = ctx.stream();
 
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
@@ -67,7 +66,10 @@ void ggml_sycl_op_timestep_embedding(ggml_backend_sycl_context & ctx, ggml_tenso
     const int dim = dst->op_params[0];
     const int max_period = dst->op_params[1];
-
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     timestep_embedding_f32_sycl(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream);
-    GGML_UNUSED(src1);
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
 }
diff --git a/ggml/src/ggml-sycl/wkv6.cpp b/ggml/src/ggml-sycl/wkv6.cpp
index b54c20964ed5d..4c01f29b98a57 100644
--- a/ggml/src/ggml-sycl/wkv6.cpp
+++ b/ggml/src/ggml-sycl/wkv6.cpp
@@ -95,10 +95,7 @@ static void rwkv_wkv_f32_kernel(
     }
 }
 
-void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
-
-    const ggml_tensor *src0 = dst->src[0];
-    const ggml_tensor *src1 = dst->src[1];
+void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) try {
 
     const float* k_d = (const float*)dst->src[0]->data;
     const float* v_d = (const float*)dst->src[1]->data;
@@ -118,11 +115,13 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
     GGML_ASSERT(C / H == WKV_BLOCK_SIZE); // The current sycl kernel is designed for RWKV6, HEAD_SIZE == 64
 
     dpct::queue_ptr stream = ctx.stream();
+    SYCL_CHECK(ggml_sycl_set_device(ctx.device));
 
     // Calculate execution configuration
     const size_t shared_mem_size = WKV_BLOCK_SIZE * 4 * sizeof(float); // For k, r, tf, td
     sycl::range<3> block_dims(1, 1, C / H);
     sycl::range<3> grid_dims(1, 1, B * H);
+    GGML_SYCL_DEBUG("call %s\n", __func__);
 
     // Submit kernel
     stream->submit([&](sycl::handler& cgh) {
@@ -137,7 +136,9 @@ void ggml_sycl_op_rwkv_wkv6(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
             );
         });
     });
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
 
-    GGML_UNUSED(src0);
-    GGML_UNUSED(src1);
+} catch (const sycl::exception & exc) {
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl;
+    std::exit(1);
 }
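Illustrative aside: the shared_mem_size above reserves four WKV_BLOCK_SIZE-sized float arrays (k, r, tf, td) per work-group. A standalone check of that arithmetic, assuming WKV_BLOCK_SIZE == 64 to match the C / H == WKV_BLOCK_SIZE assertion (HEAD_SIZE == 64):

#include <cstddef>
#include <cstdio>

int main() {
    const int wkv_block_size = 64; // assumed value of WKV_BLOCK_SIZE
    const size_t shared_mem_size = wkv_block_size * 4 * sizeof(float); // k, r, tf, td
    std::printf("%zu bytes of local memory per work-group\n", shared_mem_size); // 1024
    return 0;
}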