From a7e15b03559ed3bc91e1648f225943455cdb0a4b Mon Sep 17 00:00:00 2001 From: nscipione Date: Fri, 29 Nov 2024 11:29:00 +0000 Subject: [PATCH 1/3] [SYCL] Move to Compile Time backend selection on oneMKL Interface for NVIDIA backend Move to compile time selection to backend to avoid latency at run time. Add it to all mkl gemm calls and only for NVIDIA backend. Signed-off-by: nscipione --- ggml/src/ggml-sycl/CMakeLists.txt | 3 ++- ggml/src/ggml-sycl/dpct/helper.hpp | 21 ++++++++++++++++++--- ggml/src/ggml-sycl/ggml-sycl.cpp | 7 ++++++- ggml/src/ggml-sycl/outprod.cpp | 7 ++++++- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 83f223fd7b6fc..3579a311aac07 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -68,7 +68,8 @@ else() target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl) + add_compile_definitions(GGML_SYCL_NVIDIA) + target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas) elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index c2f28bb49579e..b92411cc3e4d1 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -1690,7 +1690,12 @@ namespace dpct auto data_b = get_memory(b); auto data_c = get_memory(c); oneapi::mkl::blas::column_major::gemm( - q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, +#ifdef GGML_SYCL_NVIDIA + oneapi::mkl::backend_selector{q}, +#else + q, +#endif + a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, beta_value, data_c, ldc); } @@ -1755,7 +1760,12 @@ namespace dpct matrix_info->groupsize_info = batch_size; sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( - q, matrix_info->transpose_info, matrix_info->transpose_info + 1, +#ifdef GGML_SYCL_NVIDIA + oneapi::mkl::backend_selector{q}, +#else + q, +#endif + matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast(a), matrix_info->ld_info, @@ -1784,7 +1794,12 @@ namespace dpct auto data_b = get_memory(b); auto data_c = get_memory(c); oneapi::mkl::blas::column_major::gemm_batch( - q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, +#ifdef GGML_SYCL_NVIDIA + oneapi::mkl::backend_selector{q}, +#else + q, +#endif + a_trans, b_trans, m, n, k, alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c, batch_size); } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 808f74fa0b27e..0f97bc7eff13a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2562,7 +2562,12 @@ inline void ggml_sycl_op_mul_mat_sycl( const float beta = 0.0f; #if !GGML_SYCL_DNNL SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( - *stream, oneapi::mkl::transpose::trans, +#ifdef GGML_SYCL_NVIDIA + oneapi::mkl::backend_selector{*stream}, +#else + *stream, +#endif + oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index e61cdc2ca5d53..7a2bfd445cbc0 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -40,7 +40,12 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr try { // Perform matrix multiplication using oneMKL GEMM - oneapi::mkl::blas::column_major::gemm(*stream, + oneapi::mkl::blas::column_major::gemm( +#ifdef GGML_SYCL_NVIDIA + oneapi::mkl::backend_selector{*stream}, +#else + *stream, +#endif oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, From ffd0a998c71466c5122b2c488cd725dc4ec6f1a4 Mon Sep 17 00:00:00 2001 From: nscipione Date: Fri, 29 Nov 2024 14:40:35 +0000 Subject: [PATCH 2/3] Formatting --- ggml/src/ggml-sycl/dpct/helper.hpp | 28 ++++++++++++---------------- ggml/src/ggml-sycl/ggml-sycl.cpp | 14 ++++++-------- ggml/src/ggml-sycl/outprod.cpp | 11 +++-------- 3 files changed, 21 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index b92411cc3e4d1..cc9304e1ae4d1 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -1691,12 +1691,11 @@ namespace dpct auto data_c = get_memory(c); oneapi::mkl::blas::column_major::gemm( #ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{q}, + oneapi::mkl::backend_selector{ q }, #else - q, + q, #endif - a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - data_b, ldb, beta_value, data_c, ldc); + a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, beta_value, data_c, ldc); } template @@ -1761,16 +1760,14 @@ namespace dpct sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( #ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{q}, + oneapi::mkl::backend_selector{ q }, #else - q, + q, #endif - matrix_info->transpose_info, matrix_info->transpose_info + 1, - matrix_info->size_info, matrix_info->size_info + 1, - matrix_info->size_info + 2, matrix_info->value_info, - reinterpret_cast(a), matrix_info->ld_info, - reinterpret_cast(b), matrix_info->ld_info + 1, - matrix_info->value_info + 1, reinterpret_cast(c), + matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, + matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info, + reinterpret_cast(a), matrix_info->ld_info, reinterpret_cast(b), + matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); q.submit([&](sycl::handler &cgh) @@ -1795,12 +1792,11 @@ namespace dpct auto data_c = get_memory(c); oneapi::mkl::blas::column_major::gemm_batch( #ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{q}, + oneapi::mkl::backend_selector{ q }, #else - q, + q, #endif - a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - stride_a, data_b, ldb, stride_b, beta_value, + a_trans, b_trans, m, n, k, alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c, batch_size); } diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 0f97bc7eff13a..b5ae9781c006b 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2562,15 +2562,13 @@ inline void ggml_sycl_op_mul_mat_sycl( const float beta = 0.0f; #if !GGML_SYCL_DNNL SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( -#ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{*stream}, -#else +# ifdef GGML_SYCL_NVIDIA + oneapi::mkl::backend_selector{ *stream }, +# else *stream, -#endif - oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, - dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, - src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), +# endif + oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); #else auto dnnl_stream = ctx.stream_dnnl(stream); diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index 7a2bfd445cbc0..5b3c38315defe 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -42,17 +42,12 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr // Perform matrix multiplication using oneMKL GEMM oneapi::mkl::blas::column_major::gemm( #ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{*stream}, + oneapi::mkl::backend_selector{ *stream }, #else *stream, #endif - oneapi::mkl::transpose::nontrans, src1_op, - ne0, ne1, ne01, - alpha, - src0_d, ne00, - src1_d, ldb, - beta, - dst_d, ne0); + oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, + ne0); } catch (sycl::exception const& exc) { std::cerr << exc.what() << std::endl; From f6e6fc4d47507ec8ffb0b7107d6f6d548824074d Mon Sep 17 00:00:00 2001 From: nscipione Date: Mon, 2 Dec 2024 14:58:30 +0000 Subject: [PATCH 3/3] Address PR comments to increase readibility --- ggml/src/ggml-sycl/dpct/helper.hpp | 36 ++++++++++++++++++------------ ggml/src/ggml-sycl/ggml-sycl.cpp | 12 +++++----- ggml/src/ggml-sycl/outprod.cpp | 10 ++++----- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index cc9304e1ae4d1..d1b5dd87c6922 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -1689,13 +1689,14 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); - oneapi::mkl::blas::column_major::gemm( #ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{ q }, + oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector{ q }, + a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, + beta_value, data_c, ldc); #else - q, + oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, + beta_value, data_c, ldc); #endif - a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, beta_value, data_c, ldc); } template @@ -1758,17 +1759,22 @@ namespace dpct matrix_info->ld_info[2] = ldc; matrix_info->groupsize_info = batch_size; - sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( #ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{ q }, + sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( + oneapi::mkl::backend_selector{ q }, matrix_info->transpose_info, + matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1, + matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast(a), + matrix_info->ld_info, reinterpret_cast(b), matrix_info->ld_info + 1, + matrix_info->value_info + 1, reinterpret_cast(c), matrix_info->ld_info + 2, 1, + &(matrix_info->groupsize_info)); #else - q, -#endif - matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, + sycl::event e = oneapi::mkl::blas::column_major::gemm_batch( + q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast(a), matrix_info->ld_info, reinterpret_cast(b), matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); +#endif q.submit([&](sycl::handler &cgh) { @@ -1790,14 +1796,16 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); - oneapi::mkl::blas::column_major::gemm_batch( #ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{ q }, + oneapi::mkl::blas::column_major::gemm_batch( + oneapi::mkl::backend_selector{ q }, a_trans, b_trans, m, n, k, + alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c, + batch_size); #else - q, + oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, + stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, + stride_c, batch_size); #endif - a_trans, b_trans, m, n, k, alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, - data_c, ldc, stride_c, batch_size); } } // namespace detail diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index b5ae9781c006b..708dbe36864eb 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2561,15 +2561,17 @@ inline void ggml_sycl_op_mul_mat_sycl( const float alpha = 1.0f; const float beta = 0.0f; #if !GGML_SYCL_DNNL - SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( # ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{ *stream }, + SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( + oneapi::mkl::backend_selector{ *stream }, oneapi::mkl::transpose::trans, + oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, + ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); # else - *stream, -# endif - oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( + *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); +# endif #else auto dnnl_stream = ctx.stream_dnnl(stream); DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt(), diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index 5b3c38315defe..ef9af0b7633ab 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -40,14 +40,14 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, const ggml_tensor* sr try { // Perform matrix multiplication using oneMKL GEMM - oneapi::mkl::blas::column_major::gemm( #ifdef GGML_SYCL_NVIDIA - oneapi::mkl::backend_selector{ *stream }, + oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector{ *stream }, + oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d, + ne00, src1_d, ldb, beta, dst_d, ne0); #else - *stream, + oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, + src0_d, ne00, src1_d, ldb, beta, dst_d, ne0); #endif - oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, - ne0); } catch (sycl::exception const& exc) { std::cerr << exc.what() << std::endl;