From 95d10d725cd93befc6a06d7d50c8ed0e854f438d Mon Sep 17 00:00:00 2001 From: safranowith Date: Tue, 28 Oct 2025 12:52:04 +0200 Subject: [PATCH 1/2] open the first files --- ggml/src/ggml-sycl/flash-attention/flash_attention.cpp | 0 ggml/src/ggml-sycl/flash-attention/flash_attention.hpp | 0 ggml/src/ggml-sycl/flash-attention/kernels/flash_kernel.hpp | 0 ggml/src/ggml-sycl/flash-attention/softmax.hpp | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 ggml/src/ggml-sycl/flash-attention/flash_attention.cpp create mode 100644 ggml/src/ggml-sycl/flash-attention/flash_attention.hpp create mode 100644 ggml/src/ggml-sycl/flash-attention/kernels/flash_kernel.hpp create mode 100644 ggml/src/ggml-sycl/flash-attention/softmax.hpp diff --git a/ggml/src/ggml-sycl/flash-attention/flash_attention.cpp b/ggml/src/ggml-sycl/flash-attention/flash_attention.cpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ggml/src/ggml-sycl/flash-attention/flash_attention.hpp b/ggml/src/ggml-sycl/flash-attention/flash_attention.hpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ggml/src/ggml-sycl/flash-attention/kernels/flash_kernel.hpp b/ggml/src/ggml-sycl/flash-attention/kernels/flash_kernel.hpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ggml/src/ggml-sycl/flash-attention/softmax.hpp b/ggml/src/ggml-sycl/flash-attention/softmax.hpp new file mode 100644 index 00000000000..e69de29bb2d From bfec4fa706adf6b2ff1574d6adc990a9a5b05022 Mon Sep 17 00:00:00 2001 From: safranowith Date: Tue, 28 Oct 2025 12:55:14 +0200 Subject: [PATCH 2/2] open the first files --- ggml/src/ggml-sycl/CMakeLists.txt | 378 +- ggml/src/ggml-sycl/backend.hpp | 86 +- ggml/src/ggml-sycl/binbcast.cpp | 690 +- ggml/src/ggml-sycl/binbcast.hpp | 78 +- ggml/src/ggml-sycl/common.cpp | 166 +- ggml/src/ggml-sycl/common.hpp | 1240 ++-- ggml/src/ggml-sycl/concat.cpp | 382 +- ggml/src/ggml-sycl/concat.hpp | 40 +- ggml/src/ggml-sycl/conv.cpp | 202 +- ggml/src/ggml-sycl/conv.hpp | 40 +- ggml/src/ggml-sycl/convert.cpp | 1284 ++-- ggml/src/ggml-sycl/convert.hpp | 68 +- ggml/src/ggml-sycl/count-equal.cpp | 158 +- ggml/src/ggml-sycl/count-equal.hpp | 18 +- ggml/src/ggml-sycl/cpy.cpp | 1210 ++-- ggml/src/ggml-sycl/cpy.hpp | 446 +- ggml/src/ggml-sycl/dequantize.hpp | 1646 ++--- ggml/src/ggml-sycl/dmmv.cpp | 2324 +++--- ggml/src/ggml-sycl/dmmv.hpp | 54 +- ggml/src/ggml-sycl/dpct/helper.hpp | 5954 ++++++++-------- ggml/src/ggml-sycl/element_wise.cpp | 2488 +++---- ggml/src/ggml-sycl/element_wise.hpp | 180 +- ggml/src/ggml-sycl/gemm.hpp | 180 +- ggml/src/ggml-sycl/getrows.cpp | 430 +- ggml/src/ggml-sycl/getrows.hpp | 40 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 9460 ++++++++++++------------- ggml/src/ggml-sycl/gla.cpp | 212 +- ggml/src/ggml-sycl/gla.hpp | 16 +- ggml/src/ggml-sycl/im2col.cpp | 272 +- ggml/src/ggml-sycl/im2col.hpp | 42 +- ggml/src/ggml-sycl/mmq.cpp | 6060 ++++++++-------- ggml/src/ggml-sycl/mmq.hpp | 66 +- ggml/src/ggml-sycl/mmvq.cpp | 2268 +++--- ggml/src/ggml-sycl/mmvq.hpp | 54 +- ggml/src/ggml-sycl/norm.cpp | 1002 +-- ggml/src/ggml-sycl/norm.hpp | 52 +- ggml/src/ggml-sycl/outprod.cpp | 94 +- ggml/src/ggml-sycl/outprod.hpp | 20 +- ggml/src/ggml-sycl/pad.cpp | 194 +- ggml/src/ggml-sycl/pad.hpp | 48 +- ggml/src/ggml-sycl/pad_reflect_1d.cpp | 144 +- ggml/src/ggml-sycl/pad_reflect_1d.hpp | 16 +- ggml/src/ggml-sycl/presets.hpp | 152 +- ggml/src/ggml-sycl/quantize.hpp | 266 +- ggml/src/ggml-sycl/quants.hpp | 220 +- ggml/src/ggml-sycl/rope.cpp | 930 +-- ggml/src/ggml-sycl/rope.hpp | 40 +- ggml/src/ggml-sycl/set.cpp | 146 +- ggml/src/ggml-sycl/set.hpp | 10 +- ggml/src/ggml-sycl/set_rows.cpp | 468 +- ggml/src/ggml-sycl/set_rows.hpp | 16 +- ggml/src/ggml-sycl/softmax.cpp | 852 +-- ggml/src/ggml-sycl/softmax.hpp | 48 +- ggml/src/ggml-sycl/sycl_hw.cpp | 30 +- ggml/src/ggml-sycl/sycl_hw.hpp | 52 +- ggml/src/ggml-sycl/tsembd.cpp | 146 +- ggml/src/ggml-sycl/tsembd.hpp | 40 +- ggml/src/ggml-sycl/vecdotq.hpp | 2606 +++---- ggml/src/ggml-sycl/wkv.cpp | 586 +- ggml/src/ggml-sycl/wkv.hpp | 20 +- 60 files changed, 23215 insertions(+), 23215 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index efd78b912cc..20fb4c6aada 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -1,189 +1,189 @@ -message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}") - -if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$") - message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD") -endif() - -check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) - -if (DEFINED ENV{ONEAPI_ROOT}) - message(STATUS "Using oneAPI Release SYCL compiler (icpx).") -elseif(SUPPORTS_SYCL) - message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}. - If you expected the oneAPI Release compiler, please install oneAPI & source it, like: - source /opt/intel/oneapi/setvars.sh") -else() - message(FATAL_ERROR "C++ compiler lacks SYCL support.") -endif() -message(STATUS "SYCL found") -#todo: AOT - -ggml_add_backend_library(ggml-sycl - ggml-sycl.cpp - ../../include/ggml-sycl.h - ) - -file(GLOB GGML_HEADERS_SYCL "*.hpp") -file(GLOB GGML_SOURCES_SYCL "*.cpp") -target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL}) - -if (WIN32) - # To generate a Visual Studio solution, using Intel C++ Compiler for ggml-sycl is mandatory - if( ${CMAKE_GENERATOR} MATCHES "Visual Studio" AND NOT (${CMAKE_GENERATOR_TOOLSET} MATCHES "Intel C")) - set_target_properties(ggml-sycl PROPERTIES VS_PLATFORM_TOOLSET "Intel C++ Compiler 2025") - set(CMAKE_CXX_COMPILER "icx") - set(CMAKE_CXX_COMPILER_ID "IntelLLVM") - endif() -endif() - -find_package(IntelSYCL) -if (IntelSYCL_FOUND) - # Use oneAPI CMake when possible - target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX) -else() - # Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance - target_compile_options(ggml-sycl PRIVATE "-fsycl") - target_link_options(ggml-sycl PRIVATE "-fsycl") -endif() - -target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") - -# Link against oneDNN -set(GGML_SYCL_DNNL 0) -if(GGML_SYCL_DNN) - find_package(DNNL) - if(DNNL_FOUND) - if (NOT DEFINED DNNL_GPU_VENDOR) - # default to intel target - set(DNNL_GPU_VENDOR "INTEL") - if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL") - message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target") - endif() - endif() - - # Verify oneDNN was compiled for the same target as llama - if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}") - target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl) - set(GGML_SYCL_DNNL 1) - get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS) - foreach(CONFIG ${CONFIGS}) - get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG}) - message(STATUS "Found oneDNN: ${DNNL_LIB}") - endforeach() - else() - message(WARNING - "oneDNN must be compiled for the same target as llama.cpp. - llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}. - Disabling oneDNN support.") - endif() - else() - message(STATUS "oneDNN not found, disabling oneDNN support") - endif() -else() - message(STATUS "oneDNN support disabled by the user") -endif() -target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL}) - -if (GGML_SYCL_F16) - if (GGML_SYCL_TARGET STREQUAL "AMD") - message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.") - endif() - add_compile_definitions(GGML_SYCL_F16) -endif() - -if (GGML_SYCL_TARGET STREQUAL "NVIDIA") - add_compile_definitions(GGML_SYCL_WARP_SIZE=32) -elseif (GGML_SYCL_TARGET STREQUAL "AMD") - # INFO: Allowed Sub_group_sizes are not consistent through all - # hip targets. For example, 64 is used for certain models, but the backend - # does not support it. - # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32) - add_compile_definitions(GGML_SYCL_WARP_SIZE=32) -else() - add_compile_definitions(GGML_SYCL_WARP_SIZE=16) -endif() - -if (GGML_SYCL_GRAPH) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH) -endif() - -# Link against Intel oneMKL or oneMath -if (GGML_SYCL_TARGET STREQUAL "INTEL") - # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically - # See https://github.com/uxlfoundation/oneMath/issues/654 - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(SYCL_COMPILER ON) - endif() - find_package(MKL REQUIRED) - target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL) -else() - find_package(oneMath QUIET) - if (NOT oneMath_FOUND) - message(STATUS "oneMath not found: oneMath will be automatically downloaded") - # Use FetchContent to automatically pull and build oneMath - include(FetchContent) - set(BUILD_FUNCTIONAL_TESTS False) - set(BUILD_EXAMPLES False) - set(TARGET_DOMAINS blas) - if (GGML_SYCL_TARGET STREQUAL "NVIDIA") - set(ENABLE_MKLCPU_BACKEND False) - set(ENABLE_MKLGPU_BACKEND False) - set(ENABLE_CUBLAS_BACKEND True) - elseif (GGML_SYCL_TARGET STREQUAL "AMD") - set(ENABLE_MKLCPU_BACKEND False) - set(ENABLE_MKLGPU_BACKEND False) - set(ENABLE_ROCBLAS_BACKEND True) - # Ensure setting a string variable here is not overriden by oneMath CACHE variables - cmake_policy(SET CMP0126 NEW) - # Setting the device architecture is only needed and useful for AMD devices in oneMath - set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE) - endif() - FetchContent_Declare( - ONEMATH - GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git - GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142 - ) - FetchContent_MakeAvailable(ONEMATH) - # Create alias to match with find_package targets name - function(onemath_alias target) - if (TARGET ${target}_obj) - # Silence verbose warnings from external libraries - target_compile_options(${target}_obj PRIVATE -w) - endif() - if (TARGET ${target}) - add_library(ONEMATH::${target} ALIAS ${target}) - endif() - endfunction() - onemath_alias(onemath) - onemath_alias(onemath_blas_mklcpu) - onemath_alias(onemath_blas_mklgpu) - onemath_alias(onemath_blas_cublas) - onemath_alias(onemath_blas_rocblas) - endif() - - # Below oneMath compile-time dispatching is used for better performance - if (GGML_SYCL_TARGET STREQUAL "NVIDIA") - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas) - target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") - target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) - elseif (GGML_SYCL_TARGET STREQUAL "AMD") - if (NOT GGML_SYCL_DEVICE_ARCH) - message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") - endif() - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas) - target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") - target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD) - else() - # Fallback to oneMath runtime dispatcher - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC) - endif() -endif() - -if (GGML_SYCL_DEVICE_ARCH) - target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) - target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) -endif() +message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}") + +if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$") + message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD") +endif() + +check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) + +if (DEFINED ENV{ONEAPI_ROOT}) + message(STATUS "Using oneAPI Release SYCL compiler (icpx).") +elseif(SUPPORTS_SYCL) + message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}. + If you expected the oneAPI Release compiler, please install oneAPI & source it, like: + source /opt/intel/oneapi/setvars.sh") +else() + message(FATAL_ERROR "C++ compiler lacks SYCL support.") +endif() +message(STATUS "SYCL found") +#todo: AOT + +ggml_add_backend_library(ggml-sycl + ggml-sycl.cpp + ../../include/ggml-sycl.h + ) + +file(GLOB GGML_HEADERS_SYCL "*.hpp") +file(GLOB GGML_SOURCES_SYCL "*.cpp") +target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL}) + +if (WIN32) + # To generate a Visual Studio solution, using Intel C++ Compiler for ggml-sycl is mandatory + if( ${CMAKE_GENERATOR} MATCHES "Visual Studio" AND NOT (${CMAKE_GENERATOR_TOOLSET} MATCHES "Intel C")) + set_target_properties(ggml-sycl PROPERTIES VS_PLATFORM_TOOLSET "Intel C++ Compiler 2025") + set(CMAKE_CXX_COMPILER "icx") + set(CMAKE_CXX_COMPILER_ID "IntelLLVM") + endif() +endif() + +find_package(IntelSYCL) +if (IntelSYCL_FOUND) + # Use oneAPI CMake when possible + target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX) +else() + # Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance + target_compile_options(ggml-sycl PRIVATE "-fsycl") + target_link_options(ggml-sycl PRIVATE "-fsycl") +endif() + +target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") + +# Link against oneDNN +set(GGML_SYCL_DNNL 0) +if(GGML_SYCL_DNN) + find_package(DNNL) + if(DNNL_FOUND) + if (NOT DEFINED DNNL_GPU_VENDOR) + # default to intel target + set(DNNL_GPU_VENDOR "INTEL") + if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL") + message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target") + endif() + endif() + + # Verify oneDNN was compiled for the same target as llama + if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}") + target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl) + set(GGML_SYCL_DNNL 1) + get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS) + foreach(CONFIG ${CONFIGS}) + get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG}) + message(STATUS "Found oneDNN: ${DNNL_LIB}") + endforeach() + else() + message(WARNING + "oneDNN must be compiled for the same target as llama.cpp. + llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}. + Disabling oneDNN support.") + endif() + else() + message(STATUS "oneDNN not found, disabling oneDNN support") + endif() +else() + message(STATUS "oneDNN support disabled by the user") +endif() +target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL}) + +if (GGML_SYCL_F16) + if (GGML_SYCL_TARGET STREQUAL "AMD") + message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.") + endif() + add_compile_definitions(GGML_SYCL_F16) +endif() + +if (GGML_SYCL_TARGET STREQUAL "NVIDIA") + add_compile_definitions(GGML_SYCL_WARP_SIZE=32) +elseif (GGML_SYCL_TARGET STREQUAL "AMD") + # INFO: Allowed Sub_group_sizes are not consistent through all + # hip targets. For example, 64 is used for certain models, but the backend + # does not support it. + # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32) + add_compile_definitions(GGML_SYCL_WARP_SIZE=32) +else() + add_compile_definitions(GGML_SYCL_WARP_SIZE=16) +endif() + +if (GGML_SYCL_GRAPH) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH) +endif() + +# Link against Intel oneMKL or oneMath +if (GGML_SYCL_TARGET STREQUAL "INTEL") + # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically + # See https://github.com/uxlfoundation/oneMath/issues/654 + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + set(SYCL_COMPILER ON) + endif() + find_package(MKL REQUIRED) + target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL) +else() + find_package(oneMath QUIET) + if (NOT oneMath_FOUND) + message(STATUS "oneMath not found: oneMath will be automatically downloaded") + # Use FetchContent to automatically pull and build oneMath + include(FetchContent) + set(BUILD_FUNCTIONAL_TESTS False) + set(BUILD_EXAMPLES False) + set(TARGET_DOMAINS blas) + if (GGML_SYCL_TARGET STREQUAL "NVIDIA") + set(ENABLE_MKLCPU_BACKEND False) + set(ENABLE_MKLGPU_BACKEND False) + set(ENABLE_CUBLAS_BACKEND True) + elseif (GGML_SYCL_TARGET STREQUAL "AMD") + set(ENABLE_MKLCPU_BACKEND False) + set(ENABLE_MKLGPU_BACKEND False) + set(ENABLE_ROCBLAS_BACKEND True) + # Ensure setting a string variable here is not overriden by oneMath CACHE variables + cmake_policy(SET CMP0126 NEW) + # Setting the device architecture is only needed and useful for AMD devices in oneMath + set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE) + endif() + FetchContent_Declare( + ONEMATH + GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git + GIT_TAG 8efe85f5aaebb37f1d8c503b7af66315feabf142 + ) + FetchContent_MakeAvailable(ONEMATH) + # Create alias to match with find_package targets name + function(onemath_alias target) + if (TARGET ${target}_obj) + # Silence verbose warnings from external libraries + target_compile_options(${target}_obj PRIVATE -w) + endif() + if (TARGET ${target}) + add_library(ONEMATH::${target} ALIAS ${target}) + endif() + endfunction() + onemath_alias(onemath) + onemath_alias(onemath_blas_mklcpu) + onemath_alias(onemath_blas_mklgpu) + onemath_alias(onemath_blas_cublas) + onemath_alias(onemath_blas_rocblas) + endif() + + # Below oneMath compile-time dispatching is used for better performance + if (GGML_SYCL_TARGET STREQUAL "NVIDIA") + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas) + target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") + target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) + elseif (GGML_SYCL_TARGET STREQUAL "AMD") + if (NOT GGML_SYCL_DEVICE_ARCH) + message(FATAL_ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") + endif() + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas) + target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") + target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD) + else() + # Fallback to oneMath runtime dispatcher + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC) + endif() +endif() + +if (GGML_SYCL_DEVICE_ARCH) + target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) + target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}) +endif() diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index b1575b81451..32d9d44f3a5 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -1,43 +1,43 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_BACKEND_HPP -#define GGML_SYCL_BACKEND_HPP - -#include "binbcast.hpp" -#include "common.hpp" -#include "concat.hpp" -#include "conv.hpp" -#include "convert.hpp" -#include "count-equal.hpp" -#include "cpy.hpp" -#include "dequantize.hpp" -#include "dmmv.hpp" -#include "element_wise.hpp" -#include "gla.hpp" -#include "im2col.hpp" -#include "mmq.hpp" -#include "mmvq.hpp" -#include "norm.hpp" -#include "outprod.hpp" -#include "pad.hpp" -#include "quantize.hpp" -#include "quants.hpp" -#include "rope.hpp" -#include "set_rows.hpp" -#include "softmax.hpp" -#include "tsembd.hpp" -#include "wkv.hpp" -#include "pad_reflect_1d.hpp" - - -#endif // GGML_SYCL_BACKEND_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_BACKEND_HPP +#define GGML_SYCL_BACKEND_HPP + +#include "binbcast.hpp" +#include "common.hpp" +#include "concat.hpp" +#include "conv.hpp" +#include "convert.hpp" +#include "count-equal.hpp" +#include "cpy.hpp" +#include "dequantize.hpp" +#include "dmmv.hpp" +#include "element_wise.hpp" +#include "gla.hpp" +#include "im2col.hpp" +#include "mmq.hpp" +#include "mmvq.hpp" +#include "norm.hpp" +#include "outprod.hpp" +#include "pad.hpp" +#include "quantize.hpp" +#include "quants.hpp" +#include "rope.hpp" +#include "set_rows.hpp" +#include "softmax.hpp" +#include "tsembd.hpp" +#include "wkv.hpp" +#include "pad_reflect_1d.hpp" + + +#endif // GGML_SYCL_BACKEND_HPP diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index 0a3883ae1ed..e5789c1eb52 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -1,345 +1,345 @@ -#include "binbcast.hpp" - -#include -#include -#include - -#include "ggml.h" - -template -static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s00,*/ int s01, int s02, int s03, - /*int s10,*/ int s11, int s12, int s13, - const sycl::nd_item<3> &item_ct1) { - const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); - const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + - item_ct1.get_local_id(0)) / - ne3; - const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + - item_ct1.get_local_id(0)) % - ne3; - - if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i3*s3 + i2*s2 + i1*s1; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - for (int i0 = i0s; i0 < ne0; - i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); - } -} - -template -static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s00,*/ int s01, int s02, int s03, - /*int s10,*/ int s11, int s12, int s13, - const sycl::nd_item<3> &item_ct1) { - - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - const int i3 = i/(ne2*ne1*ne0); - const int i2 = (i/(ne1*ne0)) % ne2; - const int i1 = (i/ne0) % ne1; - const int i0 = i % ne0; - - if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i3*s3 + i2*s2 + i1*s1; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); -} - - -template -struct bin_bcast_sycl { - template - void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00, - const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11, - const int64_t ne12, const int64_t ne13, const int64_t ne0, const int64_t ne1, const int64_t ne2, - const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03, - const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0, - const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous, - const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) { - int nr0 = ne10 / ne0; - int nr1 = ne11/ne1; - int nr2 = ne12/ne2; - int nr3 = ne13/ne3; - - int nr[4] = { nr0, nr1, nr2, nr3 }; - - // collapse dimensions until first broadcast dimension - int64_t cne[] = {ne0, ne1, ne2, ne3}; - int64_t cne0[] = {ne00, ne01, ne02, ne03}; - int64_t cne1[] = {ne10, ne11, ne12, ne13}; - size_t cnb[] = {nb0, nb1, nb2, nb3}; - size_t cnb0[] = {nb00, nb01, nb02, nb03}; - size_t cnb1[] = {nb10, nb11, nb12, nb13}; - auto collapse = [](int64_t cne[]) { - cne[0] *= cne[1]; - cne[1] = cne[2]; - cne[2] = cne[3]; - cne[3] = 1; - }; - - auto collapse_nb = [](size_t cnb[], int64_t cne[]) { - cnb[1] *= cne[1]; - cnb[2] *= cne[2]; - cnb[3] *= cne[3]; - }; - - if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) { - for (int i = 0; i < 4; i++) { - if (nr[i] != 1) { - break; - } - if (i > 0) { - collapse_nb(cnb, cne); - collapse_nb(cnb0, cne0); - collapse_nb(cnb1, cne1); - collapse(cne); - collapse(cne0); - collapse(cne1); - } - } - } - { - int64_t ne0 = cne[0]; - int64_t ne1 = cne[1]; - int64_t ne2 = cne[2]; - int64_t ne3 = cne[3]; - - int64_t ne10 = cne1[0]; - int64_t ne11 = cne1[1]; - int64_t ne12 = cne1[2]; - int64_t ne13 = cne1[3]; - - size_t nb0 = cnb[0]; - size_t nb1 = cnb[1]; - size_t nb2 = cnb[2]; - size_t nb3 = cnb[3]; - - size_t nb00 = cnb0[0]; - size_t nb01 = cnb0[1]; - size_t nb02 = cnb0[2]; - size_t nb03 = cnb0[3]; - - size_t nb10 = cnb1[0]; - size_t nb11 = cnb1[1]; - size_t nb12 = cnb1[2]; - size_t nb13 = cnb1[3]; - - size_t s0 = nb0 / sizeof(dst_t); - size_t s1 = nb1 / sizeof(dst_t); - size_t s2 = nb2 / sizeof(dst_t); - size_t s3 = nb3 / sizeof(dst_t); - - size_t s10 = nb10 / sizeof(src1_t); - size_t s11 = nb11 / sizeof(src1_t); - size_t s12 = nb12 / sizeof(src1_t); - size_t s13 = nb13 / sizeof(src1_t); - - size_t s00 = nb00 / sizeof(src0_t); - size_t s01 = nb01 / sizeof(src0_t); - size_t s02 = nb02 / sizeof(src0_t); - size_t s03 = nb03 / sizeof(src0_t); - - GGML_UNUSED(s00); - - GGML_ASSERT(nb0 % sizeof(dst_t) == 0); - GGML_ASSERT(nb1 % sizeof(dst_t) == 0); - GGML_ASSERT(nb2 % sizeof(dst_t) == 0); - GGML_ASSERT(nb3 % sizeof(dst_t) == 0); - - GGML_ASSERT(nb00 % sizeof(src0_t) == 0); - GGML_ASSERT(nb01 % sizeof(src0_t) == 0); - GGML_ASSERT(nb02 % sizeof(src0_t) == 0); - GGML_ASSERT(nb03 % sizeof(src0_t) == 0); - - GGML_ASSERT(nb10 % sizeof(src1_t) == 0); - GGML_ASSERT(nb11 % sizeof(src1_t) == 0); - GGML_ASSERT(nb12 % sizeof(src1_t) == 0); - GGML_ASSERT(nb13 % sizeof(src1_t) == 0); - - GGML_ASSERT(s0 == 1); - GGML_ASSERT(s10 == 1); - - const int block_size = 128; - - int64_t hne0 = std::max(ne0/2LL, 1LL); - - sycl::range<3> block_dims(1, 1, 1); - block_dims[2] = std::min(hne0, block_size); - block_dims[1] = std::min( - ne1, block_size / (unsigned int)block_dims[2]); - block_dims[0] = std::min( - std::min( - ne2 * ne3, block_size / (unsigned int)block_dims[2] / - (unsigned int)block_dims[1]), - 64U); - - sycl::range<3> block_nums( - (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], - (ne1 + block_dims[1] - 1) / block_dims[1], - (hne0 + block_dims[2] - 1) / block_dims[2]); - - if (block_nums[0] > 65535) { - // this is the maximum number of blocks in z direction, fallback to 1D grid kernel - int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * - sycl::range<3>(1, 1, block_size), - sycl::range<3>(1, 1, block_size)), - [=](sycl::nd_item<3> item_ct1) { - k_bin_bcast_unravel( - src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02, - s03, s11, s12, s13, item_ct1); - }); - } - } else { - /* - DPCT1049:16: The work-group size passed to the SYCL kernel may - exceed the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if - needed. - */ - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, - ne2, ne3, ne10, ne11, ne12, ne13, - s1, s2, s3, s01, s02, s03, s11, s12, s13, - item_ct1); - }); - } - } - } -}; - -template -inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, - ggml_tensor * dst) { - dpct::queue_ptr main_stream = ctx.stream(); - GGML_TENSOR_BINARY_OP_LOCALS - - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10, - ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, - ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01, - ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, - nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), - main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { - op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02, - ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, - nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream); - } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { - op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03, - ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, - nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream); - } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { - op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03, - ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, - nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream); - } else { - fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), - ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ABORT("fatal error"); - } -} - -inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - - ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); -} - -inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - - ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); -} - -inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - - ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); -} - -inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - - ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); -} - -inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - ggml_sycl_op_bin_bcast>(ctx, dst, dst->src[0], dst); -} - - -void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - ggml_sycl_op_add(ctx, dst); -} - -void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - ggml_sycl_op_sub(ctx, dst); -} - -void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - ggml_sycl_op_mul(ctx, dst); -} - -void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - ggml_sycl_op_div(ctx, dst); -} - -void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_repeat(ctx, dst); -} - +#include "binbcast.hpp" + +#include +#include +#include + +#include "ggml.h" + +template +static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1)); + const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) / + ne3; + const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) + + item_ct1.get_local_id(0)) % + ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; + i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); + } +} + +template +static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); +} + + +template +struct bin_bcast_sycl { + template + void operator()(const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, const int64_t ne00, + const int64_t ne01, const int64_t ne02, const int64_t ne03, const int64_t ne10, const int64_t ne11, + const int64_t ne12, const int64_t ne13, const int64_t ne0, const int64_t ne1, const int64_t ne2, + const int64_t ne3, const size_t nb00, const size_t nb01, const size_t nb02, const size_t nb03, + const size_t nb10, const size_t nb11, const size_t nb12, const size_t nb13, const size_t nb0, + const size_t nb1, const size_t nb2, const size_t nb3, const bool src0_is_contiguous, + const bool src1_is_contiguous, const bool dst_is_contiguous, queue_ptr stream) { + int nr0 = ne10 / ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne[] = {ne0, ne1, ne2, ne3}; + int64_t cne0[] = {ne00, ne01, ne02, ne03}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + size_t cnb[] = {nb0, nb1, nb2, nb3}; + size_t cnb0[] = {nb00, nb01, nb02, nb03}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + if (src0_is_contiguous && src1_is_contiguous && dst_is_contiguous) { + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb, cne); + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne); + collapse(cne0); + collapse(cne1); + } + } + } + { + int64_t ne0 = cne[0]; + int64_t ne1 = cne[1]; + int64_t ne2 = cne[2]; + int64_t ne3 = cne[3]; + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb[0]; + size_t nb1 = cnb[1]; + size_t nb2 = cnb[2]; + size_t nb3 = cnb[3]; + + size_t nb00 = cnb0[0]; + size_t nb01 = cnb0[1]; + size_t nb02 = cnb0[2]; + size_t nb03 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + size_t s00 = nb00 / sizeof(src0_t); + size_t s01 = nb01 / sizeof(src0_t); + size_t s02 = nb02 / sizeof(src0_t); + size_t s03 = nb03 / sizeof(src0_t); + + GGML_UNUSED(s00); + + GGML_ASSERT(nb0 % sizeof(dst_t) == 0); + GGML_ASSERT(nb1 % sizeof(dst_t) == 0); + GGML_ASSERT(nb2 % sizeof(dst_t) == 0); + GGML_ASSERT(nb3 % sizeof(dst_t) == 0); + + GGML_ASSERT(nb00 % sizeof(src0_t) == 0); + GGML_ASSERT(nb01 % sizeof(src0_t) == 0); + GGML_ASSERT(nb02 % sizeof(src0_t) == 0); + GGML_ASSERT(nb03 % sizeof(src0_t) == 0); + + GGML_ASSERT(nb10 % sizeof(src1_t) == 0); + GGML_ASSERT(nb11 % sizeof(src1_t) == 0); + GGML_ASSERT(nb12 % sizeof(src1_t) == 0); + GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + sycl::range<3> block_dims(1, 1, 1); + block_dims[2] = std::min(hne0, block_size); + block_dims[1] = std::min( + ne1, block_size / (unsigned int)block_dims[2]); + block_dims[0] = std::min( + std::min( + ne2 * ne3, block_size / (unsigned int)block_dims[2] / + (unsigned int)block_dims[1]), + 64U); + + sycl::range<3> block_nums( + (ne2 * ne3 + block_dims[0] - 1) / block_dims[0], + (ne1 + block_dims[1] - 1) / block_dims[1], + (hne0 + block_dims[2] - 1) / block_dims[2]); + + if (block_nums[0] > 65535) { + // this is the maximum number of blocks in z direction, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * + sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast_unravel( + src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, s1, s2, s3, s01, s02, + s03, s11, s12, s13, item_ct1); + }); + } + } else { + /* + DPCT1049:16: The work-group size passed to the SYCL kernel may + exceed the limit. To get the device limit, query + info::device::max_work_group_size. Adjust the work-group size if + needed. + */ + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_bin_bcast(src0_dd, src1_dd, dst_dd, ne0, ne1, + ne2, ne3, ne10, ne11, ne12, ne13, + s1, s2, s3, s01, s02, s03, s11, s12, s13, + item_ct1); + }); + } + } + } +}; + +template +inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + dpct::queue_ptr main_stream = ctx.stream(); + GGML_TENSOR_BINARY_OP_LOCALS + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + op()((const float *) src0->data, (const float *) src1->data, (float *) dst->data, ne00, ne01, ne02, ne03, ne10, + ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, + ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + op()((const sycl::half *) src0->data, (const sycl::half *) src1->data, (sycl::half *) dst->data, ne00, ne01, + ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, + nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), + main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) { + op()((const sycl::half *) src0->data, (const float *) src1->data, (sycl::half *) dst->data, ne00, ne01, ne02, + ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, + nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream); + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { + op()((const int32_t *) src0->data, (const int32_t *) src1->data, (int32_t *) dst->data, ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, + nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream); + } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) { + op()((const int16_t *) src0->data, (const int16_t *) src1->data, (int16_t *) dst->data, ne00, ne01, ne02, ne03, + ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, + nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_contiguous(dst), main_stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), + ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ABORT("fatal error"); + } +} + +inline void ggml_sycl_op_add(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); +} + +inline void ggml_sycl_op_sub(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); +} + +inline void ggml_sycl_op_mul(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); +} + +inline void ggml_sycl_op_div(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + + ggml_sycl_op_bin_bcast>(ctx, dst->src[0], dst->src[1], dst); +} + +inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + ggml_sycl_op_bin_bcast>(ctx, dst, dst->src[0], dst); +} + + +void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + ggml_sycl_op_add(ctx, dst); +} + +void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + ggml_sycl_op_sub(ctx, dst); +} + +void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + ggml_sycl_op_mul(ctx, dst); +} + +void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + ggml_sycl_op_div(ctx, dst); +} + +void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_repeat(ctx, dst); +} + diff --git a/ggml/src/ggml-sycl/binbcast.hpp b/ggml/src/ggml-sycl/binbcast.hpp index 9cce0f053a5..50a199dabc9 100644 --- a/ggml/src/ggml-sycl/binbcast.hpp +++ b/ggml/src/ggml-sycl/binbcast.hpp @@ -1,39 +1,39 @@ -#ifndef GGML_SYCL_BINBCAST_HPP -#define GGML_SYCL_BINBCAST_HPP -#include "common.hpp" - - -static __dpct_inline__ float op_repeat(const float a, const float b) { - return b; - GGML_UNUSED(a); -} - -static __dpct_inline__ float op_add(const float a, const float b) { - return a + b; -} - -static __dpct_inline__ float op_sub(const float a, const float b) { - return a - b; -} - -static __dpct_inline__ float op_mul(const float a, const float b) { - return a * b; -} - -static __dpct_inline__ float op_div(const float a, const float b) { - return a / b; -} - -void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - - -#endif //GGML_SYCL_BINBCAST_HPP - +#ifndef GGML_SYCL_BINBCAST_HPP +#define GGML_SYCL_BINBCAST_HPP +#include "common.hpp" + + +static __dpct_inline__ float op_repeat(const float a, const float b) { + return b; + GGML_UNUSED(a); +} + +static __dpct_inline__ float op_add(const float a, const float b) { + return a + b; +} + +static __dpct_inline__ float op_sub(const float a, const float b) { + return a - b; +} + +static __dpct_inline__ float op_mul(const float a, const float b) { + return a * b; +} + +static __dpct_inline__ float op_div(const float a, const float b) { + return a / b; +} + +void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + + +#endif //GGML_SYCL_BINBCAST_HPP + diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index 05fd5ef46c7..82657dc1600 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -1,83 +1,83 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#include "common.hpp" - -#include "ggml-backend-impl.h" -#include "ggml-impl.h" - -int get_current_device_id() { - return dpct::dev_mgr::instance().current_device_id(); -} - -void* ggml_sycl_host_malloc(size_t size) try { - if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { - return nullptr; - } - - void* ptr = nullptr; - // allow to use dpct::get_in_order_queue() for host malloc - dpct::err0 err = CHECK_TRY_ERROR( - ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue())); - - if (err != 0) { - // clear the error - GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0, "syclGetErrorString is not supported"); - return nullptr; - } - - return ptr; -} catch (sycl::exception const& exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -void ggml_sycl_host_free(void* ptr) try { - // allow to use dpct::get_in_order_queue() for host malloc - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue()))); -} catch (sycl::exception const& exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -bool gpu_has_xmx(sycl::device &dev) { - return dev.has(sycl::aspect::ext_intel_matrix); -} - -int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) { - const int64_t max_range = std::numeric_limits::max(); - int64_t sycl_down_blk_size = block_size; - int64_t global_range = accumulate_block_num * sycl_down_blk_size; - while(global_range > max_range) { - sycl_down_blk_size /= 2; - global_range = accumulate_block_num * sycl_down_blk_size; - } - return sycl_down_blk_size; -} - -void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams) { - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { - if (extra->events[i][is] != nullptr) { - SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is]))); - } - } - if (extra->data_device[i] != nullptr && streams.size()>0) { - ggml_sycl_set_device(i); - SYCL_CHECK( - CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i])))); - } - } - delete extra; -} +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "common.hpp" + +#include "ggml-backend-impl.h" +#include "ggml-impl.h" + +int get_current_device_id() { + return dpct::dev_mgr::instance().current_device_id(); +} + +void* ggml_sycl_host_malloc(size_t size) try { + if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { + return nullptr; + } + + void* ptr = nullptr; + // allow to use dpct::get_in_order_queue() for host malloc + dpct::err0 err = CHECK_TRY_ERROR( + ptr = (void*)sycl::malloc_host(size, dpct::get_in_order_queue())); + + if (err != 0) { + // clear the error + GGML_LOG_ERROR("WARNING: failed to allocate %.2f MB of pinned memory: %s\n", size / 1024.0 / 1024.0, "syclGetErrorString is not supported"); + return nullptr; + } + + return ptr; +} catch (sycl::exception const& exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_host_free(void* ptr) try { + // allow to use dpct::get_in_order_queue() for host malloc + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue()))); +} catch (sycl::exception const& exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +bool gpu_has_xmx(sycl::device &dev) { + return dev.has(sycl::aspect::ext_intel_matrix); +} + +int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size) { + const int64_t max_range = std::numeric_limits::max(); + int64_t sycl_down_blk_size = block_size; + int64_t global_range = accumulate_block_num * sycl_down_blk_size; + while(global_range > max_range) { + sycl_down_blk_size /= 2; + global_range = accumulate_block_num * sycl_down_blk_size; + } + return sycl_down_blk_size; +} + +void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams) { + for (int i = 0; i < ggml_sycl_info().device_count; ++i) { + for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { + if (extra->events[i][is] != nullptr) { + SYCL_CHECK(CHECK_TRY_ERROR(dpct::destroy_event(extra->events[i][is]))); + } + } + if (extra->data_device[i] != nullptr && streams.size()>0) { + ggml_sycl_set_device(i); + SYCL_CHECK( + CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i])))); + } + } + delete extra; +} diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 338fa08cda2..86ec678d5cc 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -1,620 +1,620 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_COMMON_HPP -#define GGML_SYCL_COMMON_HPP - -#include -#include -#include -#include - -#include "dpct/helper.hpp" -#include "ggml-sycl.h" -#include "presets.hpp" -#include "sycl_hw.hpp" - - -#if GGML_SYCL_DNNL -#include "dnnl.hpp" -#include "dnnl_sycl.hpp" -#endif - -#define GGML_COMMON_DECL_SYCL -#define GGML_COMMON_IMPL_SYCL -/* suppress warning spam */ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wnested-anon-types" -#include "ggml-common.h" -#pragma clang diagnostic pop -#include "ggml-impl.h" - -void* ggml_sycl_host_malloc(size_t size); -void ggml_sycl_host_free(void* ptr); - - -extern int g_ggml_sycl_debug; -extern int g_ggml_sycl_disable_optimize; -extern int g_ggml_sycl_prioritize_dmmv; - -#if defined(__clang__) && __has_builtin(__builtin_expect) -// Hint the optimizer to pipeline the more likely following instruction in branches -# define LIKELY(expr) __builtin_expect(expr, true) -# define UNLIKELY(expr) __builtin_expect(expr, false) -#else -# define LIKELY(expr) (expr) -# define UNLIKELY(expr) (expr) -#endif - -#define GGML_SYCL_DEBUG(...) \ - do { \ - if (UNLIKELY(g_ggml_sycl_debug)) \ - fprintf(stderr, __VA_ARGS__); \ - } while (0) - -#define CHECK_TRY_ERROR(expr) \ - [&]() { \ - try { \ - expr; \ - return dpct::success; \ - } catch (std::exception const& e) { \ - std::cerr << e.what() << "\nException caught at file:" << __FILE__ \ - << ", line:" << __LINE__ << ", func:" << __func__ \ - << std::endl; \ - return dpct::default_error; \ - } \ - }() - - -#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP -#define VER_4VEC 610 // todo for hardward optimize. -#define VER_GEN9 700 // todo for hardward optimize. -#define VER_GEN12 1000000 // todo for hardward optimize. -#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize. - -#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares - -// define for XMX in Intel GPU -// TODO: currently, it's not used for XMX really. -#if !defined(GGML_SYCL_FORCE_MMQ) - #define SYCL_USE_XMX -#endif - -// max batch size to use MMQ kernels when tensor cores are available -#define MMQ_MAX_BATCH_SIZE 32 - -// dmmv = dequantize_mul_mat_vec -#ifndef GGML_SYCL_DMMV_X -#define GGML_SYCL_DMMV_X 32 -#endif -#ifndef GGML_SYCL_MMV_Y -#define GGML_SYCL_MMV_Y 1 -#endif - -typedef sycl::queue *queue_ptr; - -enum ggml_sycl_backend_gpu_mode { - SYCL_UNSET_GPU_MODE = -1, - SYCL_SINGLE_GPU_MODE = 0, - SYCL_MUL_GPU_MODE -}; - -static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); - -static void crash() { - int* ptr = NULL; - *ptr = 0; -} - -[[noreturn]] static void ggml_sycl_error( - const char* stmt, - const char* func, - const char* file, - const int line, - const char* msg) { - fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg); - fprintf(stderr, " in function %s at %s:%d\n", func, file, line); - GGML_ABORT("SYCL error"); -} - -#define SYCL_CHECK(err) \ - do { \ - auto err_ = (err); \ - if (err_ != 0) \ - ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code."); \ - } while (0) - -#if DPCT_COMPAT_RT_VERSION >= 11100 -#define GGML_SYCL_ASSUME(x) __builtin_assume(x) -#else -#define GGML_SYCL_ASSUME(x) -#endif // DPCT_COMPAT_RT_VERSION >= 11100 - -#ifdef GGML_SYCL_F16 -typedef sycl::half dfloat; // dequantize float -typedef sycl::half2 dfloat2; -#else -typedef float dfloat; // dequantize float -typedef sycl::float2 dfloat2; -#endif // GGML_SYCL_F16 - -#define MMVQ_MAX_BATCH_SIZE 8 - -static int g_all_sycl_device_count = -1; -static bool g_ggml_backend_sycl_buffer_type_initialized = false; - -static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = - SYCL_UNSET_GPU_MODE; - -static void* g_scratch_buffer = nullptr; -static size_t g_scratch_size = 0; // disabled by default -static size_t g_scratch_offset = 0; - -[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) { - stream_ct1 << "ERROR: ggml-sycl was compiled without support for the " - "current GPU architecture.\n"; - // __trap(); - std::exit(1); - - (void)bad_arch; // suppress unused function warning -} - -int get_current_device_id(); - -inline dpct::err0 ggml_sycl_set_device(const int device) try { - int current_device_id; - SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id())); - - // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, - // current_device_id=%d\n", device, current_device); - if (device == current_device_id) { - return 0; - } - - return CHECK_TRY_ERROR(dpct::select_device(device)); -} catch (sycl::exception const& exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - crash(); - std::exit(1); -} - -////////////////////// -struct optimize_feature { - bool reorder=false; -}; - -struct sycl_device_info { - int cc; // compute capability - int nsm; // number of streaming multiprocessors (CUDA) maps to the maximum - // number of compute units on a SYCL device. - // size_t smpb; // max. shared memory per block - size_t smpbo; // max. shared memory per block (with opt-in) - bool vmm; // virtual memory support - size_t total_vram; - //sycl_hw_info hw_info; \\ device id and aarch, currently not used - optimize_feature opt_feature; -}; - - -struct ggml_sycl_device_info { - int device_count; - - sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {}; - - std::array default_tensor_split = {}; - - int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0}; -}; - -const ggml_sycl_device_info & ggml_sycl_info(); - -struct ggml_sycl_pool { - virtual ~ggml_sycl_pool() = default; - - virtual void * alloc(size_t size, size_t * actual_size) = 0; - virtual void free(void * ptr, size_t size) = 0; -}; - -template -struct ggml_sycl_pool_alloc { - ggml_sycl_pool * pool = nullptr; - T * ptr = nullptr; - size_t actual_size = 0; - - explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) { - } - - ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) { - alloc(size); - } - - ~ggml_sycl_pool_alloc() { - if (ptr != nullptr) { - pool->free(ptr, actual_size); - } - } - - T * realloc(size_t size) { - GGML_ASSERT(pool != nullptr); - if (ptr) - pool->free(ptr, actual_size); - ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size); - return ptr; - } - - // size is in number of elements - T * alloc(size_t size) { - GGML_ASSERT(pool != nullptr); - GGML_ASSERT(ptr == nullptr); - ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size); - return ptr; - } - - T * alloc(ggml_sycl_pool & pool, size_t size) { - this->pool = &pool; - return alloc(size); - } - - T * get() { - return ptr; - } - - ggml_sycl_pool_alloc() = default; - ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete; - ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete; - ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete; - ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete; -}; - -// backend interface - -struct ggml_tensor_extra_gpu { - void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split - // tensors - dpct::event_ptr events[GGML_SYCL_MAX_DEVICES] - [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs - optimize_feature optimized_feature; -}; - -void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); - -namespace sycl_ex = sycl::ext::oneapi::experimental; -struct ggml_backend_sycl_context { - int device; - std::string name; - optimize_feature opt_feature; - - queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; - - explicit ggml_backend_sycl_context(int device) : - device(device), - name(GGML_SYCL_NAME + std::to_string(device)) { - opt_feature = ggml_sycl_info().devices[device].opt_feature; - } - - queue_ptr stream(int device, int stream) { - if (qptrs[device][stream] == nullptr) { - qptrs[device][stream] = &(dpct::get_device(device).default_queue()); - } - return qptrs[device][stream]; - } - - queue_ptr stream() { - return stream(device, 0); - } - -#if GGML_SYCL_DNNL - dnnl::engine make_engine(sycl::queue* q) { - // Get the device associated with the queue - sycl::device dev = q->get_device(); - // Get the context associated with the queue - sycl::context ctx = q->get_context(); - const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx); - return eng; - } - - std::unordered_map stream_map; - std::unordered_map engine_map; - dnnl::stream stream_dnnl(int device, int _stream) { - auto q = stream(device, _stream); - return stream_dnnl(q); - } - dnnl::engine engine_dnnl(sycl::queue* qptr) { - auto it = engine_map.find(qptr); - if (it == engine_map.end()) { - auto eng = make_engine(qptr); - engine_map[qptr] = eng; - return eng; - } - else - { - return it->second; - } - } - dnnl::stream stream_dnnl(sycl::queue* qptr) { - auto it = stream_map.find(qptr); - if (it == stream_map.end()) { - auto eng = engine_dnnl(qptr); - auto stream = dnnl::sycl_interop::make_stream(eng, *qptr); - stream_map[qptr] = stream; - return stream; - } - else - { - return it->second; - } - } - dnnl::stream stream_dnnl() { - return stream_dnnl(device, 0); - } - dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md, - const dnnl::engine & eng, const queue_ptr q) { - ggml_sycl_pool_alloc * pool; - auto it = scratchpad_map.find(q); - if (it == scratchpad_map.end()) { - scratchpad_map[q] = std::make_unique>(this->pool()); - pool = scratchpad_map[q].get(); - } else { - pool = it->second.get(); - } - - size_t scratchpad_size = scratchpad_md.get_size(); - if (scratchpad_size > pool->actual_size) { - pool->realloc(scratchpad_size); - } - void * mem_ptr = pool->get(); - return dnnl::memory(scratchpad_md, eng, mem_ptr); - } -#endif - - // pool - std::unique_ptr pools[GGML_SYCL_MAX_DEVICES]; - std::unordered_map>> scratchpad_map; - - std::unique_ptr host_pools[GGML_SYCL_MAX_DEVICES]; - - static std::unique_ptr new_pool_for_device(queue_ptr qptr, int device); - - static std::unique_ptr new_pool_for_host(queue_ptr qptr, int device); - - ggml_sycl_pool & pool(int device) { - if (pools[device] == nullptr) { - pools[device] = new_pool_for_device(stream(device,0), device); - } - return *pools[device]; - } - - ggml_sycl_pool & pool() { - return pool(device); - } - -#ifdef GGML_SYCL_GRAPH - std::unique_ptr> exec_graph = nullptr; -#endif - - ggml_sycl_pool & host_pool(int device) { - if (host_pools[device] == nullptr) { - host_pools[device] = new_pool_for_host(stream(device, 0), device); - } - return *host_pools[device]; - } - - ggml_sycl_pool & host_pool() { return host_pool(device); } -}; - -// common device functions - -static __dpct_inline__ float warp_reduce_sum(float x, - const sycl::nd_item<3>& item_ct1) { -#pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { - x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask); - } - return x; -} - -static __dpct_inline__ sycl::float2 -warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) { -#pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { - a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), - mask); - a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), - mask); - } - return a; -} - -template -static __dpct_inline__ int warp_reduce_sum(int x) { - return sycl::reduce_over_group( - sycl::ext::oneapi::this_work_item::get_sub_group(), x, sycl::plus<>()); -} - -template -static __dpct_inline__ float warp_reduce_sum(float x) { -#pragma unroll - for (int offset = width / 2; offset > 0; offset >>= 1) { - x += dpct::permute_sub_group_by_xor( - sycl::ext::oneapi::this_work_item::get_sub_group(), x, offset, width); - } - return x; -} - -template -static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a) { -#pragma unroll - for (int offset = width / 2; offset > 0; offset >>= 1) { - a.x() += dpct::permute_sub_group_by_xor( - sycl::ext::oneapi::this_work_item::get_sub_group(), a.x(), offset, - width); - a.y() += dpct::permute_sub_group_by_xor( - sycl::ext::oneapi::this_work_item::get_sub_group(), a.y(), offset, - width); - } - return a; -} - -template -static __dpct_inline__ sycl::half2 warp_reduce_sum(sycl::half2 a) { -#pragma unroll - for (int offset = width / 2; offset > 0; offset >>= 1) { - a = a + dpct::permute_sub_group_by_xor( - sycl::ext::oneapi::this_work_item::get_sub_group(), a, offset, - width); - } - return a; -} - -static constexpr int ggml_sycl_get_physical_warp_size() { - // todo: for old iGPU + dGPU case, need to be changed. - return WARP_SIZE; -} - -template -static __dpct_inline__ float warp_reduce_max(float x) { -#pragma unroll - for (int offset = width / 2; offset > 0; offset >>= 1) { - x = sycl::fmax(x, dpct::permute_sub_group_by_xor( - sycl::ext::oneapi::this_work_item::get_sub_group(), x, - offset, width)); - } - return x; -} - -static __dpct_inline__ float warp_reduce_max(float x, - const sycl::nd_item<3>& item_ct1) { -#pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { - x = sycl::fmax(x, dpct::permute_sub_group_by_xor( - item_ct1.get_sub_group(), x, mask)); - } - return x; -} - -/* Helper for Computing the linear offset of a ggml_tensor given -per-dimension sizes, strides, and indices */ -template -__dpct_inline__ size_t calculate_offset(const std::array & strides, const std::array & indices) { - size_t offset = 0; -#pragma unroll - for (int i = 0; i < N; i++) { - auto index_i = indices[i]; - offset += strides[i] * index_i; - } - return offset; -} - -// Helper for vec loading aligned data -template -inline sycl::vec vec_aligned_load(const Tp* aligned_ptr) { - return *reinterpret_cast*>(aligned_ptr); -} - -// Helper for accessing pointers with no warnings -template -static __dpct_inline__ Tp* get_pointer(sycl::local_accessor acc) { - return acc.template get_multi_ptr().get(); -} - -int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size); - -constexpr size_t ceil_div(const size_t m, const size_t n) { - return (m + n - 1) / n; -} - -bool gpu_has_xmx(sycl::device &dev); - -template std::string debug_get_array_str(const std::string & prefix, const T array[N]) { - if (LIKELY(!g_ggml_sycl_debug)) { - return ""; - } - std::stringstream ss; - ss << prefix << "=["; - for (std::size_t i = 0; i < N - 1; ++i) { - ss << array[i] << ", "; - } - if constexpr (N > 0) { - ss << array[N - 1]; - } - ss << "]"; - return ss.str(); -} - -inline std::string debug_get_tensor_str(const std::string &prefix, - const ggml_tensor *tensor, const std::string &suffix = "") { - std::stringstream ss; - if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); } - ss << prefix.c_str() << "="; - if (tensor) { - ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type); - ss << debug_get_array_str(";ne", tensor->ne); - ss << debug_get_array_str(";nb", tensor->nb); - - if (!ggml_is_contiguous(tensor)) { ss << ";strided"; } - if (ggml_is_permuted(tensor)) { ss << ";permuted"; } - } else { - ss << "nullptr"; - } - ss << suffix; - return ss.str(); -} - -// Use scope_op_debug_print to log operations coming from running a model -struct scope_op_debug_print { - // Use string_views to avoid the cost of creating a string and concatenating them - // string_views must be alive for as long as the object is alive - // scope_op_debug_print are used with string literals in practice which are stored in constant space so always accessible - scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst, - std::size_t num_src, const std::string_view & suffix = "") : - func(func), - func_suffix(func_suffix) { - if (LIKELY(!g_ggml_sycl_debug)) { - return; - } - GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data()); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str()); - if (dst) { - for (std::size_t i = 0; i < num_src; ++i) { - GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str()); - } - } - GGML_SYCL_DEBUG("%s\n", suffix.data()); - } - - scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src, - const std::string_view & suffix = "") : - scope_op_debug_print(func, "", dst, num_src, suffix) {} - - ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); } - - private: - std::string_view func; - std::string_view func_suffix; -}; - -static __dpct_inline__ float get_alibi_slope(const float max_bias, - const uint32_t h, - const uint32_t n_head_log2, - const float m0, - const float m1) { - if (max_bias <= 0.0f) { - return 1.0f; - } - const float base = h < n_head_log2 ? m0 : m1; - const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; - - return dpct::pow(base, exph); -} - -#endif // GGML_SYCL_COMMON_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_COMMON_HPP +#define GGML_SYCL_COMMON_HPP + +#include +#include +#include +#include + +#include "dpct/helper.hpp" +#include "ggml-sycl.h" +#include "presets.hpp" +#include "sycl_hw.hpp" + + +#if GGML_SYCL_DNNL +#include "dnnl.hpp" +#include "dnnl_sycl.hpp" +#endif + +#define GGML_COMMON_DECL_SYCL +#define GGML_COMMON_IMPL_SYCL +/* suppress warning spam */ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnested-anon-types" +#include "ggml-common.h" +#pragma clang diagnostic pop +#include "ggml-impl.h" + +void* ggml_sycl_host_malloc(size_t size); +void ggml_sycl_host_free(void* ptr); + + +extern int g_ggml_sycl_debug; +extern int g_ggml_sycl_disable_optimize; +extern int g_ggml_sycl_prioritize_dmmv; + +#if defined(__clang__) && __has_builtin(__builtin_expect) +// Hint the optimizer to pipeline the more likely following instruction in branches +# define LIKELY(expr) __builtin_expect(expr, true) +# define UNLIKELY(expr) __builtin_expect(expr, false) +#else +# define LIKELY(expr) (expr) +# define UNLIKELY(expr) (expr) +#endif + +#define GGML_SYCL_DEBUG(...) \ + do { \ + if (UNLIKELY(g_ggml_sycl_debug)) \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) + +#define CHECK_TRY_ERROR(expr) \ + [&]() { \ + try { \ + expr; \ + return dpct::success; \ + } catch (std::exception const& e) { \ + std::cerr << e.what() << "\nException caught at file:" << __FILE__ \ + << ", line:" << __LINE__ << ", func:" << __func__ \ + << std::endl; \ + return dpct::default_error; \ + } \ + }() + + +#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP +#define VER_4VEC 610 // todo for hardward optimize. +#define VER_GEN9 700 // todo for hardward optimize. +#define VER_GEN12 1000000 // todo for hardward optimize. +#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize. + +#define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares + +// define for XMX in Intel GPU +// TODO: currently, it's not used for XMX really. +#if !defined(GGML_SYCL_FORCE_MMQ) + #define SYCL_USE_XMX +#endif + +// max batch size to use MMQ kernels when tensor cores are available +#define MMQ_MAX_BATCH_SIZE 32 + +// dmmv = dequantize_mul_mat_vec +#ifndef GGML_SYCL_DMMV_X +#define GGML_SYCL_DMMV_X 32 +#endif +#ifndef GGML_SYCL_MMV_Y +#define GGML_SYCL_MMV_Y 1 +#endif + +typedef sycl::queue *queue_ptr; + +enum ggml_sycl_backend_gpu_mode { + SYCL_UNSET_GPU_MODE = -1, + SYCL_SINGLE_GPU_MODE = 0, + SYCL_MUL_GPU_MODE +}; + +static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +static void crash() { + int* ptr = NULL; + *ptr = 0; +} + +[[noreturn]] static void ggml_sycl_error( + const char* stmt, + const char* func, + const char* file, + const int line, + const char* msg) { + fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg); + fprintf(stderr, " in function %s at %s:%d\n", func, file, line); + GGML_ABORT("SYCL error"); +} + +#define SYCL_CHECK(err) \ + do { \ + auto err_ = (err); \ + if (err_ != 0) \ + ggml_sycl_error(#err, __func__, __FILE__, __LINE__, "Exception caught in this line of code."); \ + } while (0) + +#if DPCT_COMPAT_RT_VERSION >= 11100 +#define GGML_SYCL_ASSUME(x) __builtin_assume(x) +#else +#define GGML_SYCL_ASSUME(x) +#endif // DPCT_COMPAT_RT_VERSION >= 11100 + +#ifdef GGML_SYCL_F16 +typedef sycl::half dfloat; // dequantize float +typedef sycl::half2 dfloat2; +#else +typedef float dfloat; // dequantize float +typedef sycl::float2 dfloat2; +#endif // GGML_SYCL_F16 + +#define MMVQ_MAX_BATCH_SIZE 8 + +static int g_all_sycl_device_count = -1; +static bool g_ggml_backend_sycl_buffer_type_initialized = false; + +static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = + SYCL_UNSET_GPU_MODE; + +static void* g_scratch_buffer = nullptr; +static size_t g_scratch_size = 0; // disabled by default +static size_t g_scratch_offset = 0; + +[[noreturn]] static inline void bad_arch(const sycl::stream& stream_ct1) { + stream_ct1 << "ERROR: ggml-sycl was compiled without support for the " + "current GPU architecture.\n"; + // __trap(); + std::exit(1); + + (void)bad_arch; // suppress unused function warning +} + +int get_current_device_id(); + +inline dpct::err0 ggml_sycl_set_device(const int device) try { + int current_device_id; + SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id())); + + // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, + // current_device_id=%d\n", device, current_device); + if (device == current_device_id) { + return 0; + } + + return CHECK_TRY_ERROR(dpct::select_device(device)); +} catch (sycl::exception const& exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + crash(); + std::exit(1); +} + +////////////////////// +struct optimize_feature { + bool reorder=false; +}; + +struct sycl_device_info { + int cc; // compute capability + int nsm; // number of streaming multiprocessors (CUDA) maps to the maximum + // number of compute units on a SYCL device. + // size_t smpb; // max. shared memory per block + size_t smpbo; // max. shared memory per block (with opt-in) + bool vmm; // virtual memory support + size_t total_vram; + //sycl_hw_info hw_info; \\ device id and aarch, currently not used + optimize_feature opt_feature; +}; + + +struct ggml_sycl_device_info { + int device_count; + + sycl_device_info devices[GGML_SYCL_MAX_DEVICES] = {}; + + std::array default_tensor_split = {}; + + int max_work_group_sizes[GGML_SYCL_MAX_DEVICES] = {0}; +}; + +const ggml_sycl_device_info & ggml_sycl_info(); + +struct ggml_sycl_pool { + virtual ~ggml_sycl_pool() = default; + + virtual void * alloc(size_t size, size_t * actual_size) = 0; + virtual void free(void * ptr, size_t size) = 0; +}; + +template +struct ggml_sycl_pool_alloc { + ggml_sycl_pool * pool = nullptr; + T * ptr = nullptr; + size_t actual_size = 0; + + explicit ggml_sycl_pool_alloc(ggml_sycl_pool & pool) : pool(&pool) { + } + + ggml_sycl_pool_alloc(ggml_sycl_pool & pool, size_t size) : pool(&pool) { + alloc(size); + } + + ~ggml_sycl_pool_alloc() { + if (ptr != nullptr) { + pool->free(ptr, actual_size); + } + } + + T * realloc(size_t size) { + GGML_ASSERT(pool != nullptr); + if (ptr) + pool->free(ptr, actual_size); + ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size); + return ptr; + } + + // size is in number of elements + T * alloc(size_t size) { + GGML_ASSERT(pool != nullptr); + GGML_ASSERT(ptr == nullptr); + ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size); + return ptr; + } + + T * alloc(ggml_sycl_pool & pool, size_t size) { + this->pool = &pool; + return alloc(size); + } + + T * get() { + return ptr; + } + + ggml_sycl_pool_alloc() = default; + ggml_sycl_pool_alloc(const ggml_sycl_pool_alloc &) = delete; + ggml_sycl_pool_alloc(ggml_sycl_pool_alloc &&) = delete; + ggml_sycl_pool_alloc& operator=(const ggml_sycl_pool_alloc &) = delete; + ggml_sycl_pool_alloc& operator=(ggml_sycl_pool_alloc &&) = delete; +}; + +// backend interface + +struct ggml_tensor_extra_gpu { + void* data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split + // tensors + dpct::event_ptr events[GGML_SYCL_MAX_DEVICES] + [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs + optimize_feature optimized_feature; +}; + +void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); + +namespace sycl_ex = sycl::ext::oneapi::experimental; +struct ggml_backend_sycl_context { + int device; + std::string name; + optimize_feature opt_feature; + + queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; + + explicit ggml_backend_sycl_context(int device) : + device(device), + name(GGML_SYCL_NAME + std::to_string(device)) { + opt_feature = ggml_sycl_info().devices[device].opt_feature; + } + + queue_ptr stream(int device, int stream) { + if (qptrs[device][stream] == nullptr) { + qptrs[device][stream] = &(dpct::get_device(device).default_queue()); + } + return qptrs[device][stream]; + } + + queue_ptr stream() { + return stream(device, 0); + } + +#if GGML_SYCL_DNNL + dnnl::engine make_engine(sycl::queue* q) { + // Get the device associated with the queue + sycl::device dev = q->get_device(); + // Get the context associated with the queue + sycl::context ctx = q->get_context(); + const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx); + return eng; + } + + std::unordered_map stream_map; + std::unordered_map engine_map; + dnnl::stream stream_dnnl(int device, int _stream) { + auto q = stream(device, _stream); + return stream_dnnl(q); + } + dnnl::engine engine_dnnl(sycl::queue* qptr) { + auto it = engine_map.find(qptr); + if (it == engine_map.end()) { + auto eng = make_engine(qptr); + engine_map[qptr] = eng; + return eng; + } + else + { + return it->second; + } + } + dnnl::stream stream_dnnl(sycl::queue* qptr) { + auto it = stream_map.find(qptr); + if (it == stream_map.end()) { + auto eng = engine_dnnl(qptr); + auto stream = dnnl::sycl_interop::make_stream(eng, *qptr); + stream_map[qptr] = stream; + return stream; + } + else + { + return it->second; + } + } + dnnl::stream stream_dnnl() { + return stream_dnnl(device, 0); + } + dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md, + const dnnl::engine & eng, const queue_ptr q) { + ggml_sycl_pool_alloc * pool; + auto it = scratchpad_map.find(q); + if (it == scratchpad_map.end()) { + scratchpad_map[q] = std::make_unique>(this->pool()); + pool = scratchpad_map[q].get(); + } else { + pool = it->second.get(); + } + + size_t scratchpad_size = scratchpad_md.get_size(); + if (scratchpad_size > pool->actual_size) { + pool->realloc(scratchpad_size); + } + void * mem_ptr = pool->get(); + return dnnl::memory(scratchpad_md, eng, mem_ptr); + } +#endif + + // pool + std::unique_ptr pools[GGML_SYCL_MAX_DEVICES]; + std::unordered_map>> scratchpad_map; + + std::unique_ptr host_pools[GGML_SYCL_MAX_DEVICES]; + + static std::unique_ptr new_pool_for_device(queue_ptr qptr, int device); + + static std::unique_ptr new_pool_for_host(queue_ptr qptr, int device); + + ggml_sycl_pool & pool(int device) { + if (pools[device] == nullptr) { + pools[device] = new_pool_for_device(stream(device,0), device); + } + return *pools[device]; + } + + ggml_sycl_pool & pool() { + return pool(device); + } + +#ifdef GGML_SYCL_GRAPH + std::unique_ptr> exec_graph = nullptr; +#endif + + ggml_sycl_pool & host_pool(int device) { + if (host_pools[device] == nullptr) { + host_pools[device] = new_pool_for_host(stream(device, 0), device); + } + return *host_pools[device]; + } + + ggml_sycl_pool & host_pool() { return host_pool(device); } +}; + +// common device functions + +static __dpct_inline__ float warp_reduce_sum(float x, + const sycl::nd_item<3>& item_ct1) { +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask); + } + return x; +} + +static __dpct_inline__ sycl::float2 +warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3>& item_ct1) { +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), + mask); + a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), + mask); + } + return a; +} + +template +static __dpct_inline__ int warp_reduce_sum(int x) { + return sycl::reduce_over_group( + sycl::ext::oneapi::this_work_item::get_sub_group(), x, sycl::plus<>()); +} + +template +static __dpct_inline__ float warp_reduce_sum(float x) { +#pragma unroll + for (int offset = width / 2; offset > 0; offset >>= 1) { + x += dpct::permute_sub_group_by_xor( + sycl::ext::oneapi::this_work_item::get_sub_group(), x, offset, width); + } + return x; +} + +template +static __dpct_inline__ sycl::float2 warp_reduce_sum(sycl::float2 a) { +#pragma unroll + for (int offset = width / 2; offset > 0; offset >>= 1) { + a.x() += dpct::permute_sub_group_by_xor( + sycl::ext::oneapi::this_work_item::get_sub_group(), a.x(), offset, + width); + a.y() += dpct::permute_sub_group_by_xor( + sycl::ext::oneapi::this_work_item::get_sub_group(), a.y(), offset, + width); + } + return a; +} + +template +static __dpct_inline__ sycl::half2 warp_reduce_sum(sycl::half2 a) { +#pragma unroll + for (int offset = width / 2; offset > 0; offset >>= 1) { + a = a + dpct::permute_sub_group_by_xor( + sycl::ext::oneapi::this_work_item::get_sub_group(), a, offset, + width); + } + return a; +} + +static constexpr int ggml_sycl_get_physical_warp_size() { + // todo: for old iGPU + dGPU case, need to be changed. + return WARP_SIZE; +} + +template +static __dpct_inline__ float warp_reduce_max(float x) { +#pragma unroll + for (int offset = width / 2; offset > 0; offset >>= 1) { + x = sycl::fmax(x, dpct::permute_sub_group_by_xor( + sycl::ext::oneapi::this_work_item::get_sub_group(), x, + offset, width)); + } + return x; +} + +static __dpct_inline__ float warp_reduce_max(float x, + const sycl::nd_item<3>& item_ct1) { +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + x = sycl::fmax(x, dpct::permute_sub_group_by_xor( + item_ct1.get_sub_group(), x, mask)); + } + return x; +} + +/* Helper for Computing the linear offset of a ggml_tensor given +per-dimension sizes, strides, and indices */ +template +__dpct_inline__ size_t calculate_offset(const std::array & strides, const std::array & indices) { + size_t offset = 0; +#pragma unroll + for (int i = 0; i < N; i++) { + auto index_i = indices[i]; + offset += strides[i] * index_i; + } + return offset; +} + +// Helper for vec loading aligned data +template +inline sycl::vec vec_aligned_load(const Tp* aligned_ptr) { + return *reinterpret_cast*>(aligned_ptr); +} + +// Helper for accessing pointers with no warnings +template +static __dpct_inline__ Tp* get_pointer(sycl::local_accessor acc) { + return acc.template get_multi_ptr().get(); +} + +int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size); + +constexpr size_t ceil_div(const size_t m, const size_t n) { + return (m + n - 1) / n; +} + +bool gpu_has_xmx(sycl::device &dev); + +template std::string debug_get_array_str(const std::string & prefix, const T array[N]) { + if (LIKELY(!g_ggml_sycl_debug)) { + return ""; + } + std::stringstream ss; + ss << prefix << "=["; + for (std::size_t i = 0; i < N - 1; ++i) { + ss << array[i] << ", "; + } + if constexpr (N > 0) { + ss << array[N - 1]; + } + ss << "]"; + return ss.str(); +} + +inline std::string debug_get_tensor_str(const std::string &prefix, + const ggml_tensor *tensor, const std::string &suffix = "") { + std::stringstream ss; + if (LIKELY(!g_ggml_sycl_debug)) { return ss.str(); } + ss << prefix.c_str() << "="; + if (tensor) { + ss << "'" << tensor->name << "':type=" << ggml_type_name(tensor->type); + ss << debug_get_array_str(";ne", tensor->ne); + ss << debug_get_array_str(";nb", tensor->nb); + + if (!ggml_is_contiguous(tensor)) { ss << ";strided"; } + if (ggml_is_permuted(tensor)) { ss << ";permuted"; } + } else { + ss << "nullptr"; + } + ss << suffix; + return ss.str(); +} + +// Use scope_op_debug_print to log operations coming from running a model +struct scope_op_debug_print { + // Use string_views to avoid the cost of creating a string and concatenating them + // string_views must be alive for as long as the object is alive + // scope_op_debug_print are used with string literals in practice which are stored in constant space so always accessible + scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst, + std::size_t num_src, const std::string_view & suffix = "") : + func(func), + func_suffix(func_suffix) { + if (LIKELY(!g_ggml_sycl_debug)) { + return; + } + GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data()); + GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" dst", dst).c_str()); + if (dst) { + for (std::size_t i = 0; i < num_src; ++i) { + GGML_SYCL_DEBUG("%s", debug_get_tensor_str("\tsrc" + std::to_string(i), dst->src[i]).c_str()); + } + } + GGML_SYCL_DEBUG("%s\n", suffix.data()); + } + + scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src, + const std::string_view & suffix = "") : + scope_op_debug_print(func, "", dst, num_src, suffix) {} + + ~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); } + + private: + std::string_view func; + std::string_view func_suffix; +}; + +static __dpct_inline__ float get_alibi_slope(const float max_bias, + const uint32_t h, + const uint32_t n_head_log2, + const float m0, + const float m1) { + if (max_bias <= 0.0f) { + return 1.0f; + } + const float base = h < n_head_log2 ? m0 : m1; + const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + return dpct::pow(base, exph); +} + +#endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index c7683650483..d8e22a9aea9 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -1,191 +1,191 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#include "concat.hpp" -#include "common.hpp" - -static void concat_f32_dim0(const float *x, const float *y, float *dst, - const int ne0, const int ne00, - const sycl::nd_item<3> &item_ct1) { - int nidx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (nidx >= ne0) { - return; - } - // operation - int offset_dst = nidx + item_ct1.get_group(1) * ne0 + - item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - if (nidx < ne00) { // src0 - int offset_src = nidx + item_ct1.get_group(1) * ne00 + - item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1); - dst[offset_dst] = x[offset_src]; - } else { - int offset_src = - nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) + - item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1); - dst[offset_dst] = y[offset_src]; - } -} - -static void concat_f32_dim1(const float *x, const float *y, float *dst, - const int ne0, const int ne01, - const sycl::nd_item<3> &item_ct1) { - int nidx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (nidx >= ne0) { - return; - } - // operation - int offset_dst = nidx + item_ct1.get_group(1) * ne0 + - item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - if (item_ct1.get_group(1) < (size_t) ne01) { // src0 - int offset_src = - nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01; - dst[offset_dst] = x[offset_src]; - } else { - int offset_src = - nidx + (item_ct1.get_group(1) - ne01) * ne0 + - item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01); - dst[offset_dst] = y[offset_src]; - } -} - -static void concat_f32_dim2(const float *x, const float *y, float *dst, - const int ne0, const int ne02, - const sycl::nd_item<3> &item_ct1) { - int nidx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (nidx >= ne0) { - return; - } - // operation - int offset_dst = nidx + item_ct1.get_group(1) * ne0 + - item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - if (item_ct1.get_group(0) < (size_t) ne02) { // src0 - int offset_src = nidx + item_ct1.get_group(1) * ne0 + - item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); - dst[offset_dst] = x[offset_src]; - } else { - int offset_src = - nidx + item_ct1.get_group(1) * ne0 + - (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1); - dst[offset_dst] = y[offset_src]; - } -} - -static void concat_f32_sycl(const float *x, const float *y, float *dst, - int ne00, int ne01, int ne02, int ne0, int ne1, - int ne2, int dim, queue_ptr stream) { - int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE; - sycl::range<3> gridDim(ne2, ne1, num_blocks); - switch (dim) { - case 0: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); - }); - break; - case 1: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); - }); - break; - // dim >=2 will be dispatched to the default path - default: - stream->parallel_for( - sycl::nd_range<3>(gridDim * - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); - }); - break; - } -} - -// non-contiguous kernel (slow) -static void concat_f32_sycl_non_cont( - queue_ptr stream, const char *src0, const char *src1, char *dst, - int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00, - uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/, - int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10, - uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1, - int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2, - uint64_t nb3, int32_t dim) { - sycl::range<3> gridDim(ne3, ne2, ne1); - stream->parallel_for(sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { - int64_t i3 = item_ct1.get_group(0); - int64_t i2 = item_ct1.get_group(1); - int64_t i1 = item_ct1.get_group(2); - - int64_t o[4] = { 0, 0, 0, 0 }; - o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); - - const float * x; - - for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) { - if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00); - } else { - x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 + - (i0 - o[0]) * nb10); - } - - float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0); - - *y = *x; - } - }); -} - -void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - queue_ptr stream = ctx.stream(); - - const int32_t dim = ((int32_t *) dst->op_params)[0]; - - if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { - const float * src0_d = (const float *) src0->data; - const float * src1_d = (const float *) src1->data; - - float * dst_d = (float *) dst->data; - - if (dim != 3) { - for (int i3 = 0; i3 < dst->ne[3]; i3++) { - concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), - dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0], - dst->ne[1], dst->ne[2], dim, stream); - } - } else { - const size_t size0 = ggml_nbytes(src0); - const size_t size1 = ggml_nbytes(src1); - - SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); - SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); - } - } else { - concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], - src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], - src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], - dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); - } -} +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "concat.hpp" +#include "common.hpp" + +static void concat_f32_dim0(const float *x, const float *y, float *dst, + const int ne0, const int ne00, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (nidx < ne00) { // src0 + int offset_src = nidx + item_ct1.get_group(1) * ne00 + + item_ct1.get_group(0) * ne00 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx - ne00 + item_ct1.get_group(1) * (ne0 - ne00) + + item_ct1.get_group(0) * (ne0 - ne00) * item_ct1.get_group_range(1); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_dim1(const float *x, const float *y, float *dst, + const int ne0, const int ne01, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (item_ct1.get_group(1) < (size_t) ne01) { // src0 + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + item_ct1.get_group(0) * ne0 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + (item_ct1.get_group(1) - ne01) * ne0 + + item_ct1.get_group(0) * ne0 * (item_ct1.get_group_range(1) - ne01); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_dim2(const float *x, const float *y, float *dst, + const int ne0, const int ne02, + const sycl::nd_item<3> &item_ct1) { + int nidx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + if (item_ct1.get_group(0) < (size_t) ne02) { // src0 + int offset_src = nidx + item_ct1.get_group(1) * ne0 + + item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + item_ct1.get_group(1) * ne0 + + (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1); + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_sycl(const float *x, const float *y, float *dst, + int ne00, int ne01, int ne02, int ne0, int ne1, + int ne2, int dim, queue_ptr stream) { + int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE; + sycl::range<3> gridDim(ne2, ne1, num_blocks); + switch (dim) { + case 0: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1); + }); + break; + case 1: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1); + }); + break; + // dim >=2 will be dispatched to the default path + default: + stream->parallel_for( + sycl::nd_range<3>(gridDim * + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1); + }); + break; + } +} + +// non-contiguous kernel (slow) +static void concat_f32_sycl_non_cont( + queue_ptr stream, const char *src0, const char *src1, char *dst, + int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00, + uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/, + int64_t /*ne11*/, int64_t /*ne12*/, int64_t /*ne13*/, uint64_t nb10, + uint64_t nb11, uint64_t nb12, uint64_t nb13, int64_t ne0, int64_t ne1, + int64_t ne2, int64_t ne3, uint64_t nb0, uint64_t nb1, uint64_t nb2, + uint64_t nb3, int32_t dim) { + sycl::range<3> gridDim(ne3, ne2, ne1); + stream->parallel_for(sycl::nd_range<3>(gridDim, sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + int64_t i3 = item_ct1.get_group(0); + int64_t i2 = item_ct1.get_group(1); + int64_t i1 = item_ct1.get_group(2); + + int64_t o[4] = { 0, 0, 0, 0 }; + o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03)); + + const float * x; + + for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) { + if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { + x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00); + } else { + x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 + + (i0 - o[0]) * nb10); + } + + float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0); + + *y = *x; + } + }); +} + +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + queue_ptr stream = ctx.stream(); + + const int32_t dim = ((int32_t *) dst->op_params)[0]; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + + float * dst_d = (float *) dst->data; + + if (dim != 3) { + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), + dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0], + dst->ne[1], dst->ne[2], dim, stream); + } + } else { + const size_t size0 = ggml_nbytes(src0); + const size_t size1 = ggml_nbytes(src1); + + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait())); + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait())); + } + } else { + concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], + src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], + dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim); + } +} diff --git a/ggml/src/ggml-sycl/concat.hpp b/ggml/src/ggml-sycl/concat.hpp index e5cb7314c9f..4ea92a35538 100644 --- a/ggml/src/ggml-sycl/concat.hpp +++ b/ggml/src/ggml-sycl/concat.hpp @@ -1,20 +1,20 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_CONCAT_HPP -#define GGML_SYCL_CONCAT_HPP - -#include "common.hpp" - -void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst); - -#endif // GGML_SYCL_CONCAT_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONCAT_HPP +#define GGML_SYCL_CONCAT_HPP + +#include "common.hpp" + +void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst); + +#endif // GGML_SYCL_CONCAT_HPP diff --git a/ggml/src/ggml-sycl/conv.cpp b/ggml/src/ggml-sycl/conv.cpp index 475bd34a25d..61cc00cb758 100644 --- a/ggml/src/ggml-sycl/conv.cpp +++ b/ggml/src/ggml-sycl/conv.cpp @@ -1,101 +1,101 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#include "conv.hpp" - -static void conv_transpose_1d_kernel( - const int s0, const int output_size, - const int src0_ne0, const int src0_ne1, const int src0_ne2, - const int src1_ne0, const int dst_ne0, - const float * src0, const float * src1, float * dst, - const sycl::nd_item<3> &item_ct1) { - int global_index = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (global_index >= output_size) { - return; - } - - int out_index = global_index / dst_ne0; - - float accumulator = 0; - - for (int c = 0; c < src0_ne2; c++) { - int idx = global_index % dst_ne0; - - int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0); - int input_offset = src1_ne0 * c; - - for (int i = 0; i < src1_ne0; i++) { - if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) { - continue; - } - int weight_idx = idx - i*s0; - - float kernel_weight = src0[kernel_offset + weight_idx]; - float input_value = src1[input_offset+i]; - - accumulator += kernel_weight * input_value; - } - } - dst[global_index] = accumulator; -} - -static void conv_transpose_1d_f32_f32_sycl( - const int s0, const int output_size, - const int src0_ne0, const int src0_ne1, const int src0_ne2, - const int src1_ne0, const int dst_ne0, - const float *src0, const float *src1, float *dst, - const queue_ptr& stream) { - - const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE; - const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE); - const sycl::range<3> block_nums(1, 1, num_blocks); - stream->parallel_for( - sycl::nd_range<3>( - block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - conv_transpose_1d_kernel( - s0, output_size, - src0_ne0, src0_ne1, src0_ne2, - src1_ne0, dst_ne0, - src0, src1, dst, item_ct1); - }); -} - -void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - const ggml_tensor *src0 = dst->src[0]; - const ggml_tensor *src1 = dst->src[1]; - const float * src0_d = (const float *)src0->data; - const float * src1_d = (const float *)src1->data; - - float * dst_d = (float *)dst->data; - dpct::queue_ptr stream = ctx.stream(); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - - const int32_t * opts = (const int32_t *)dst->op_params; - - const int s0 = opts[0]; - - const int64_t output_size = ggml_nelements(dst); - - conv_transpose_1d_f32_f32_sycl(s0, output_size, - src0->ne[0], src0->ne[1], src0->ne[2], - src1->ne[0], dst->ne[0], - src0_d, src1_d, dst_d, stream); -} - +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "conv.hpp" + +static void conv_transpose_1d_kernel( + const int s0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, + const int src1_ne0, const int dst_ne0, + const float * src0, const float * src1, float * dst, + const sycl::nd_item<3> &item_ct1) { + int global_index = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (global_index >= output_size) { + return; + } + + int out_index = global_index / dst_ne0; + + float accumulator = 0; + + for (int c = 0; c < src0_ne2; c++) { + int idx = global_index % dst_ne0; + + int kernel_offset = (src0_ne0 * src0_ne1 * c) + (out_index * src0_ne0); + int input_offset = src1_ne0 * c; + + for (int i = 0; i < src1_ne0; i++) { + if (!(idx >= i*s0 && idx < i*s0 + src0_ne0)) { + continue; + } + int weight_idx = idx - i*s0; + + float kernel_weight = src0[kernel_offset + weight_idx]; + float input_value = src1[input_offset+i]; + + accumulator += kernel_weight * input_value; + } + } + dst[global_index] = accumulator; +} + +static void conv_transpose_1d_f32_f32_sycl( + const int s0, const int output_size, + const int src0_ne0, const int src0_ne1, const int src0_ne2, + const int src1_ne0, const int dst_ne0, + const float *src0, const float *src1, float *dst, + const queue_ptr& stream) { + + const int num_blocks = (output_size + SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE - 1) / SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE; + const sycl::range<3> block_dims(1, 1, SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE); + const sycl::range<3> block_nums(1, 1, num_blocks); + stream->parallel_for( + sycl::nd_range<3>( + block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + conv_transpose_1d_kernel( + s0, output_size, + src0_ne0, src0_ne1, src0_ne2, + src1_ne0, dst_ne0, + src0, src1, dst, item_ct1); + }); +} + +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + const ggml_tensor *src0 = dst->src[0]; + const ggml_tensor *src1 = dst->src[1]; + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + + float * dst_d = (float *)dst->data; + dpct::queue_ptr stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + const int32_t * opts = (const int32_t *)dst->op_params; + + const int s0 = opts[0]; + + const int64_t output_size = ggml_nelements(dst); + + conv_transpose_1d_f32_f32_sycl(s0, output_size, + src0->ne[0], src0->ne[1], src0->ne[2], + src1->ne[0], dst->ne[0], + src0_d, src1_d, dst_d, stream); +} + diff --git a/ggml/src/ggml-sycl/conv.hpp b/ggml/src/ggml-sycl/conv.hpp index f9e60dc7580..651a12dd99a 100644 --- a/ggml/src/ggml-sycl/conv.hpp +++ b/ggml/src/ggml-sycl/conv.hpp @@ -1,20 +1,20 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_CONV_HPP -#define GGML_SYCL_CONV_HPP - -#include "common.hpp" - -void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst); - -#endif // GGML_SYCL_CONV_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONV_HPP +#define GGML_SYCL_CONV_HPP + +#include "common.hpp" + +void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst); + +#endif // GGML_SYCL_CONV_HPP diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index 96d2583b13b..2f96b66e422 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -1,642 +1,642 @@ -#include "convert.hpp" -#include "dequantize.hpp" -#include "presets.hpp" - -template -static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, - const sycl::nd_item<3> &item_ct1) { - const int64_t i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2)); - - if (i >= k) { - return; - } - - const int64_t ib = i/qk; // block index - const int64_t iqs = (i%qk)/qr; // quant index - const int64_t iybs = i - i%qk; // y block start index - const int64_t y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(vx, ib, iqs, v); - - y[iybs + iqs + 0] = v.x(); - y[iybs + iqs + y_offset] = v.y(); -} - -template -static void dequantize_block_sycl(const void *__restrict__ vx, - dst_t *__restrict__ y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>( - sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block(vx, y, k, item_ct1); - }); - } -} - -template -static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; -#if QK_K == 256 - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); - } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q2_K(vx, y, item_ct1); - }); - } - -#endif -} - -template -static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; -#if QK_K == 256 - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); - } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q3_K(vx, y, item_ct1); - }); - } -#endif -} - -template -static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb32 = k / 32; - const int64_t nb = (k + 255) / 256; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_0(vx, y, nb32, item_ct1); - }); - } -} - -template -static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - int constexpr WARP_K = WARP_SIZE * QK4_0; - const int n_warp = (k + WARP_K - 1) / WARP_K; - GGML_ASSERT(k % 2 == 0); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * - sycl::range<3>(1, 1, WARP_SIZE), - sycl::range<3>(1, 1, WARP_SIZE)), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{ - dequantize_block_q4_0_reorder(vx, y, k, item_ct1); - }); - -} - -template -static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb32 = k / 32; - const int64_t nb = (k + 255) / 256; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_1(vx, y, nb32, item_ct1); - }); - } -} - - -template -static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1); - }); - }); - } -} - -template -static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - const size_t local_size = 32; - const size_t global_size = nb * local_size; - - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - - stream->submit([&](sycl::handler & cgh) { - sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); - - cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)), - [=](sycl::nd_item<1> item_ct1) { - dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb); - }); - }); -} - -template -static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; -#if QK_K == 256 - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); - } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q5_K(vx, y, item_ct1); - }); - } - -#endif -} - -template -static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; -#if QK_K == 256 - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 64), - sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q6_K(vx, y, item_ct1); - }); - } -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q6_K(vx, y, item_ct1); - }); - } - -#endif -} - -template -static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), - [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); }); -} - -template -static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_s( - vx, y, item_ct1, iq1s_grid_gpu - ); - }); - }); - } -} - -template -static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_m( - vx, y, item_ct1, iq1s_grid_gpu - ); - }); - }); - } -} - -template -static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_xxs( - vx, y, item_ct1, iq2xxs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); - }); - } -} - -template -static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_xs( - vx, y, item_ct1, iq2xs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); - }); - } -} - -template -static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq2_s(vx, y, item_ct1); - }); - }); - } -} - - -template -static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq3_xxs( - vx, y, item_ct1, iq3xxs_grid, - ksigns_iq2xs, kmask_iq2xs); - }); - }); - } -} - -template -static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = k / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq3_s( - vx, y, item_ct1, kmask_iq2xs, iq3s_grid); - }); - }); - } -} - -template -static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = (k + QK_K - 1) / QK_K; -#if QK_K == 64 - dequantize_row_iq4_nl_sycl(vx, y, k, stream); -#else - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq4_xs(vx, y, item_ct1); - }); - }); - } -#endif -} - -template -static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k, - dpct::queue_ptr stream) { - const int64_t nb = (k + QK_K - 1) / QK_K; - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - cgh.parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq4_nl(vx, y, item_ct1); - }); - }); - } -} - -template -static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, - const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03, - const sycl::nd_item<3> & item_ct1) { - - const int64_t work_group_size = item_ct1.get_local_range(2); - const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2); - - const int64_t i01 = item_ct1.get_group(1); - const int64_t i02 = item_ct1.get_group(0) % ne02; - const int64_t i03 = item_ct1.get_group(0) / ne02; - - // make each work-item deal with more elements since sycl global range can not exceed max int - const src_t * x = static_cast(vx); - const int64_t ix = i03 * s03 + i02 * s02 + i01 * s01; - const int64_t iy = ((i03 * ne02 + i02) * ne01 + i01) * ne00; - -#pragma unroll - for (int64_t i00 = global_id; i00 < ne00; i00 += work_group_size * item_ct1.get_group_range(2)) { - y[iy + i00] = static_cast(x[ix + i00]); - } -} - -template -static void convert_unary_nc_sycl(const void * __restrict__ vx, dst_t * __restrict__ y, - const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, - const int64_t s01, const int64_t s02, const int64_t s03, dpct::queue_ptr queue) { - dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 }); - - sycl::range<3> global_size(ne02 * ne03, ne01, ceil_div(ne00, SYCL_DEQUANTIZE_BLOCK_SIZE)); - - // decrease global range when it exceeds the max int - // TODO: Downsample logic is separated from the kernel, a rewrite is desirable - int64_t downsized_workgroup = downsample_sycl_global_range(global_size[0], SYCL_DEQUANTIZE_BLOCK_SIZE); - sycl::range<3> workgroup_size(1, 1, downsized_workgroup); - - queue->parallel_for(sycl::nd_range<3>(global_size * workgroup_size, workgroup_size), [=](sycl::nd_item<3> item_ct1) { - convert_unary_nc(vx, y, ne00, ne01, ne02, s01, s02, s03, item_ct1); - }); -} - -template -static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr queue) { - convert_unary_nc_sycl(vx, y, k, 1, 1, 1, k, k, k, queue); -} - -to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { - switch (type) { - case GGML_TYPE_Q4_0: - if (dst->src[0]->extra && - ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { - return dequantize_row_q4_0_sycl_reorder; - } else { - return dequantize_block_sycl; - } - case GGML_TYPE_Q4_1: - return dequantize_block_sycl; - case GGML_TYPE_Q5_0: - return dequantize_block_sycl; - case GGML_TYPE_Q5_1: - return dequantize_block_sycl; - case GGML_TYPE_Q8_0: - return dequantize_block_sycl; - case GGML_TYPE_Q2_K: - return dequantize_row_q2_K_sycl; - case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_sycl; - case GGML_TYPE_Q4_K: - if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - return dequantize_row_q4_K_sycl_reorder; - } else { - return dequantize_row_q4_K_sycl; - } - case GGML_TYPE_Q5_K: - return dequantize_row_q5_K_sycl; - case GGML_TYPE_Q6_K: - if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - return dequantize_row_q6_K_sycl_reorder; - } else { - return dequantize_row_q6_K_sycl; - } - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_sycl; - case GGML_TYPE_IQ1_M: - return dequantize_row_iq1_m_sycl; - case GGML_TYPE_IQ2_XXS: - return dequantize_row_iq2_xxs_sycl; - case GGML_TYPE_IQ2_XS: - return dequantize_row_iq2_xs_sycl; - case GGML_TYPE_IQ2_S: - return dequantize_row_iq2_s_sycl; - case GGML_TYPE_IQ3_XXS: - return dequantize_row_iq3_xxs_sycl; - case GGML_TYPE_IQ3_S: - return dequantize_row_iq3_s_sycl; - case GGML_TYPE_IQ4_XS: - return dequantize_row_iq4_xs_sycl; - case GGML_TYPE_IQ4_NL: - return dequantize_row_iq4_nl_sycl; - case GGML_TYPE_F32: - return convert_unary_sycl; - default: - return nullptr; - } -} - -to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) { - switch (type) { - case GGML_TYPE_Q4_0: - if (dst->src[0]->extra && - ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { - return dequantize_row_q4_0_sycl_reorder; - } else { - return dequantize_row_q4_0_sycl; - } - case GGML_TYPE_Q4_1: - return dequantize_row_q4_1_sycl; - case GGML_TYPE_Q5_0: - return dequantize_block_sycl; - case GGML_TYPE_Q5_1: - return dequantize_block_sycl; - case GGML_TYPE_Q8_0: - return dequantize_block_sycl; - case GGML_TYPE_Q2_K: - return dequantize_row_q2_K_sycl; - case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_sycl; - case GGML_TYPE_Q4_K: - if (dst->src[0]->extra && - ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { - return dequantize_row_q4_K_sycl_reorder; - } else { - return dequantize_row_q4_K_sycl; - } - case GGML_TYPE_Q5_K: - return dequantize_row_q5_K_sycl; - case GGML_TYPE_Q6_K: - if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - return dequantize_row_q6_K_sycl_reorder; - } else { - return dequantize_row_q6_K_sycl; - } - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_sycl; - case GGML_TYPE_IQ1_M: - return dequantize_row_iq1_m_sycl; - case GGML_TYPE_IQ2_XXS: - return dequantize_row_iq2_xxs_sycl; - case GGML_TYPE_IQ2_XS: - return dequantize_row_iq2_xs_sycl; - case GGML_TYPE_IQ2_S: - return dequantize_row_iq2_s_sycl; - case GGML_TYPE_IQ3_XXS: - return dequantize_row_iq3_xxs_sycl; - case GGML_TYPE_IQ3_S: - return dequantize_row_iq3_s_sycl; - case GGML_TYPE_IQ4_XS: - return dequantize_row_iq4_xs_sycl; - case GGML_TYPE_IQ4_NL: - return dequantize_row_iq4_nl_sycl; - case GGML_TYPE_F16: - return convert_unary_sycl; - default: - return nullptr; - } -} - -to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) { - switch (type) { - case GGML_TYPE_F32: - return convert_unary_nc_sycl; - default: - return nullptr; - } -} +#include "convert.hpp" +#include "dequantize.hpp" +#include "presets.hpp" + +template +static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, + const sycl::nd_item<3> &item_ct1) { + const int64_t i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2)); + + if (i >= k) { + return; + } + + const int64_t ib = i/qk; // block index + const int64_t iqs = (i%qk)/qr; // quant index + const int64_t iybs = i - i%qk; // y block start index + const int64_t y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x(); + y[iybs + iqs + y_offset] = v.y(); +} + +template +static void dequantize_block_sycl(const void *__restrict__ vx, + dst_t *__restrict__ y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>( + sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block(vx, y, k, item_ct1); + }); + } +} + +template +static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q2_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q2_K(vx, y, item_ct1); + }); + } + +#endif +} + +template +static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q3_K(vx, y, item_ct1); + }); + } +#endif +} + +template +static void dequantize_row_q4_0_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb32 = k / 32; + const int64_t nb = (k + 255) / 256; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_0(vx, y, nb32, item_ct1); + }); + } +} + +template +static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + int constexpr WARP_K = WARP_SIZE * QK4_0; + const int n_warp = (k + WARP_K - 1) / WARP_K; + GGML_ASSERT(k % 2 == 0); + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) * + sycl::range<3>(1, 1, WARP_SIZE), + sycl::range<3>(1, 1, WARP_SIZE)), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{ + dequantize_block_q4_0_reorder(vx, y, k, item_ct1); + }); + +} + +template +static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb32 = k / 32; + const int64_t nb = (k + 255) / 256; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_1(vx, y, nb32, item_ct1); + }); + } +} + + +template +static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q4_K(vx, y, get_pointer(scale_local_acc), item_ct1); + }); + }); + } +} + +template +static void dequantize_row_q4_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + const size_t local_size = 32; + const size_t global_size = nb * local_size; + + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->submit([&](sycl::handler & cgh) { + sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); + + cgh.parallel_for(sycl::nd_range<1>(sycl::range<1>(global_size), sycl::range<1>(local_size)), + [=](sycl::nd_item<1> item_ct1) { + dequantize_block_q4_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb); + }); + }); +} + +template +static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q5_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q5_K(vx, y, item_ct1); + }); + } + +#endif +} + +template +static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; +#if QK_K == 256 + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 64), + sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q6_K(vx, y, item_ct1); + }); + } +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_q6_K(vx, y, item_ct1); + }); + } + +#endif +} + +template +static void dequantize_row_q6_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), + [=](sycl::nd_item<3> item_ct1) { dequantize_block_q6_K_reorder(vx, y, item_ct1, nb); }); +} + +template +static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_s( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} + +template +static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_m( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} + +template +static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xxs( + vx, y, item_ct1, iq2xxs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template +static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_xs( + vx, y, item_ct1, iq2xs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template +static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_s(vx, y, item_ct1); + }); + }); + } +} + + +template +static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_xxs( + vx, y, item_ct1, iq3xxs_grid, + ksigns_iq2xs, kmask_iq2xs); + }); + }); + } +} + +template +static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_s( + vx, y, item_ct1, kmask_iq2xs, iq3s_grid); + }); + }); + } +} + +template +static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = (k + QK_K - 1) / QK_K; +#if QK_K == 64 + dequantize_row_iq4_nl_sycl(vx, y, k, stream); +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_xs(vx, y, item_ct1); + }); + }); + } +#endif +} + +template +static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int64_t k, + dpct::queue_ptr stream) { + const int64_t nb = (k + QK_K - 1) / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_nl(vx, y, item_ct1); + }); + }); + } +} + +template +static void convert_unary_nc(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t ne00, const int64_t ne01, + const int64_t ne02, const int64_t s01, const int64_t s02, const int64_t s03, + const sycl::nd_item<3> & item_ct1) { + + const int64_t work_group_size = item_ct1.get_local_range(2); + const int64_t global_id = item_ct1.get_local_id(2) + work_group_size * item_ct1.get_group(2); + + const int64_t i01 = item_ct1.get_group(1); + const int64_t i02 = item_ct1.get_group(0) % ne02; + const int64_t i03 = item_ct1.get_group(0) / ne02; + + // make each work-item deal with more elements since sycl global range can not exceed max int + const src_t * x = static_cast(vx); + const int64_t ix = i03 * s03 + i02 * s02 + i01 * s01; + const int64_t iy = ((i03 * ne02 + i02) * ne01 + i01) * ne00; + +#pragma unroll + for (int64_t i00 = global_id; i00 < ne00; i00 += work_group_size * item_ct1.get_group_range(2)) { + y[iy + i00] = static_cast(x[ix + i00]); + } +} + +template +static void convert_unary_nc_sycl(const void * __restrict__ vx, dst_t * __restrict__ y, + const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03, + const int64_t s01, const int64_t s02, const int64_t s03, dpct::queue_ptr queue) { + dpct::has_capability_or_fail(queue->get_device(), { sycl::aspect::fp16 }); + + sycl::range<3> global_size(ne02 * ne03, ne01, ceil_div(ne00, SYCL_DEQUANTIZE_BLOCK_SIZE)); + + // decrease global range when it exceeds the max int + // TODO: Downsample logic is separated from the kernel, a rewrite is desirable + int64_t downsized_workgroup = downsample_sycl_global_range(global_size[0], SYCL_DEQUANTIZE_BLOCK_SIZE); + sycl::range<3> workgroup_size(1, 1, downsized_workgroup); + + queue->parallel_for(sycl::nd_range<3>(global_size * workgroup_size, workgroup_size), [=](sycl::nd_item<3> item_ct1) { + convert_unary_nc(vx, y, ne00, ne01, ne02, s01, s02, s03, item_ct1); + }); +} + +template +static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr queue) { + convert_unary_nc_sycl(vx, y, k, 1, 1, 1, k, k, k, queue); +} + +to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { + switch (type) { + case GGML_TYPE_Q4_0: + if (dst->src[0]->extra && + ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q4_0_sycl_reorder; + } else { + return dequantize_block_sycl; + } + case GGML_TYPE_Q4_1: + return dequantize_block_sycl; + case GGML_TYPE_Q5_0: + return dequantize_block_sycl; + case GGML_TYPE_Q5_1: + return dequantize_block_sycl; + case GGML_TYPE_Q8_0: + return dequantize_block_sycl; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_sycl; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_sycl; + case GGML_TYPE_Q4_K: + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q4_K_sycl_reorder; + } else { + return dequantize_row_q4_K_sycl; + } + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_sycl; + case GGML_TYPE_Q6_K: + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q6_K_sycl_reorder; + } else { + return dequantize_row_q6_K_sycl; + } + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; + case GGML_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_sycl; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_sycl; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; + case GGML_TYPE_F32: + return convert_unary_sycl; + default: + return nullptr; + } +} + +to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) { + switch (type) { + case GGML_TYPE_Q4_0: + if (dst->src[0]->extra && + ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q4_0_sycl_reorder; + } else { + return dequantize_row_q4_0_sycl; + } + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_sycl; + case GGML_TYPE_Q5_0: + return dequantize_block_sycl; + case GGML_TYPE_Q5_1: + return dequantize_block_sycl; + case GGML_TYPE_Q8_0: + return dequantize_block_sycl; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_sycl; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_sycl; + case GGML_TYPE_Q4_K: + if (dst->src[0]->extra && + ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q4_K_sycl_reorder; + } else { + return dequantize_row_q4_K_sycl; + } + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_sycl; + case GGML_TYPE_Q6_K: + if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + return dequantize_row_q6_K_sycl_reorder; + } else { + return dequantize_row_q6_K_sycl; + } + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; + case GGML_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_sycl; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_sycl; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; + case GGML_TYPE_F16: + return convert_unary_sycl; + default: + return nullptr; + } +} + +to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return convert_unary_nc_sycl; + default: + return nullptr; + } +} diff --git a/ggml/src/ggml-sycl/convert.hpp b/ggml/src/ggml-sycl/convert.hpp index f8cb573e368..6aac9b39447 100644 --- a/ggml/src/ggml-sycl/convert.hpp +++ b/ggml/src/ggml-sycl/convert.hpp @@ -1,34 +1,34 @@ -// -// MIT license -// Copyright (C) 2025 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_CONVERT_HPP -#define GGML_SYCL_CONVERT_HPP - -#include "common.hpp" - -template -using to_t_sycl_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, dpct::queue_ptr stream); -typedef to_t_sycl_t to_fp32_sycl_t; -typedef to_t_sycl_t to_fp16_sycl_t; - -to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst); -to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst); - -// Nc = Non-contiguous -template -using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, - int64_t s01, int64_t s02, int64_t s03, dpct::queue_ptr queue); - -typedef to_t_nc_sycl_t to_fp16_nc_sycl_t; -to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type); - -#endif // GGML_SYCL_CONVERT_HPP +// +// MIT license +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_CONVERT_HPP +#define GGML_SYCL_CONVERT_HPP + +#include "common.hpp" + +template +using to_t_sycl_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, dpct::queue_ptr stream); +typedef to_t_sycl_t to_fp32_sycl_t; +typedef to_t_sycl_t to_fp16_sycl_t; + +to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst); +to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor * dst); + +// Nc = Non-contiguous +template +using to_t_nc_sycl_t = void (*)(const void * x, T * y, int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, + int64_t s01, int64_t s02, int64_t s03, dpct::queue_ptr queue); + +typedef to_t_nc_sycl_t to_fp16_nc_sycl_t; +to_fp16_nc_sycl_t get_to_fp16_nc_sycl(ggml_type type); + +#endif // GGML_SYCL_CONVERT_HPP diff --git a/ggml/src/ggml-sycl/count-equal.cpp b/ggml/src/ggml-sycl/count-equal.cpp index b0a8b4820de..ac110172b62 100644 --- a/ggml/src/ggml-sycl/count-equal.cpp +++ b/ggml/src/ggml-sycl/count-equal.cpp @@ -1,79 +1,79 @@ -#include "count-equal.hpp" - -#include - -template -static void count_equal(const T *__restrict__ x, const T *__restrict__ y, - int64_t *__restrict__ dst, const int64_t dk, - const int64_t k) { - auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); - const int64_t i0 = (int64_t)item_ct1.get_group(2) * dk; - const int64_t i1 = sycl::min(i0 + dk, k); - - int nequal = 0; - - for (int64_t i = i0 + item_ct1.get_local_id(2); i < i1; i += WARP_SIZE) { - const T xi = x[i]; - const T yi = y[i]; - nequal += xi == yi; - } - - nequal = warp_reduce_sum(nequal); - - if (item_ct1.get_local_id(2) != 0) { - return; - } - - dpct::atomic_fetch_add( - (int *)dst, nequal); -} - -void ggml_sycl_count_equal(ggml_backend_sycl_context &ctx, ggml_tensor *dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - - GGML_ASSERT(src0->type == src1->type); - GGML_ASSERT( dst->type == GGML_TYPE_I64); - - GGML_ASSERT(ggml_are_same_shape(src0, src1)); - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(src1)); - GGML_ASSERT(ggml_is_contiguous(dst)); - - int64_t * dst_d = (int64_t *) dst->data; - - dpct::queue_ptr stream = ctx.stream(); - const int id = get_current_device_id(); - const int nsm = ggml_sycl_info().devices[id].nsm; - - const int64_t ne = ggml_nelements(src0); - GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int"); - const int64_t dne = - GGML_PAD((ne + 4 * nsm - 1) / (4 * nsm), SYCL_COUNT_EQUAL_CHUNK_SIZE); - - SYCL_CHECK(CHECK_TRY_ERROR(stream->memset(dst_d, 0, ggml_nbytes(dst)))); - - const dpct::dim3 block_dims(WARP_SIZE, 1, 1); - const dpct::dim3 block_nums( - std::min((int64_t)4 * nsm, (ne + SYCL_COUNT_EQUAL_CHUNK_SIZE - 1) / - SYCL_COUNT_EQUAL_CHUNK_SIZE), - 1, 1); - - switch (src0->type) { - case GGML_TYPE_I32: { - const int *src0_d = (const int *)src0->data; - const int *src1_d = (const int *)src1->data; - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - count_equal(src0_d, src1_d, dst_d, dne, ne); - GGML_UNUSED(item_ct1); - }); - - } break; - default: - GGML_ASSERT(false); - break; - } -} +#include "count-equal.hpp" + +#include + +template +static void count_equal(const T *__restrict__ x, const T *__restrict__ y, + int64_t *__restrict__ dst, const int64_t dk, + const int64_t k) { + auto item_ct1 = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); + const int64_t i0 = (int64_t)item_ct1.get_group(2) * dk; + const int64_t i1 = sycl::min(i0 + dk, k); + + int nequal = 0; + + for (int64_t i = i0 + item_ct1.get_local_id(2); i < i1; i += WARP_SIZE) { + const T xi = x[i]; + const T yi = y[i]; + nequal += xi == yi; + } + + nequal = warp_reduce_sum(nequal); + + if (item_ct1.get_local_id(2) != 0) { + return; + } + + dpct::atomic_fetch_add( + (int *)dst, nequal); +} + +void ggml_sycl_count_equal(ggml_backend_sycl_context &ctx, ggml_tensor *dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == src1->type); + GGML_ASSERT( dst->type == GGML_TYPE_I64); + + GGML_ASSERT(ggml_are_same_shape(src0, src1)); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + int64_t * dst_d = (int64_t *) dst->data; + + dpct::queue_ptr stream = ctx.stream(); + const int id = get_current_device_id(); + const int nsm = ggml_sycl_info().devices[id].nsm; + + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int"); + const int64_t dne = + GGML_PAD((ne + 4 * nsm - 1) / (4 * nsm), SYCL_COUNT_EQUAL_CHUNK_SIZE); + + SYCL_CHECK(CHECK_TRY_ERROR(stream->memset(dst_d, 0, ggml_nbytes(dst)))); + + const dpct::dim3 block_dims(WARP_SIZE, 1, 1); + const dpct::dim3 block_nums( + std::min((int64_t)4 * nsm, (ne + SYCL_COUNT_EQUAL_CHUNK_SIZE - 1) / + SYCL_COUNT_EQUAL_CHUNK_SIZE), + 1, 1); + + switch (src0->type) { + case GGML_TYPE_I32: { + const int *src0_d = (const int *)src0->data; + const int *src1_d = (const int *)src1->data; + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + count_equal(src0_d, src1_d, dst_d, dne, ne); + GGML_UNUSED(item_ct1); + }); + + } break; + default: + GGML_ASSERT(false); + break; + } +} diff --git a/ggml/src/ggml-sycl/count-equal.hpp b/ggml/src/ggml-sycl/count-equal.hpp index f7f4fcbd0ba..c61f486c1b6 100644 --- a/ggml/src/ggml-sycl/count-equal.hpp +++ b/ggml/src/ggml-sycl/count-equal.hpp @@ -1,9 +1,9 @@ -#ifndef GGML_SYCL_COUNT_EQUAL_HPP -#define GGML_SYCL_COUNT_EQUAL_HPP -#include "common.hpp" - -#define SYCL_COUNT_EQUAL_CHUNK_SIZE 128 - -void ggml_sycl_count_equal(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -#endif //GGML_SYCL_COUNT_EQUAL_HPP +#ifndef GGML_SYCL_COUNT_EQUAL_HPP +#define GGML_SYCL_COUNT_EQUAL_HPP +#include "common.hpp" + +#define SYCL_COUNT_EQUAL_CHUNK_SIZE 128 + +void ggml_sycl_count_equal(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif //GGML_SYCL_COUNT_EQUAL_HPP diff --git a/ggml/src/ggml-sycl/cpy.cpp b/ggml/src/ggml-sycl/cpy.cpp index 1ec99b0a5d1..2ce1970cef2 100644 --- a/ggml/src/ggml-sycl/cpy.cpp +++ b/ggml/src/ggml-sycl/cpy.cpp @@ -1,605 +1,605 @@ -#include "cpy.hpp" - -#include - -#include "dequantize.hpp" -#include "ggml-sycl/common.hpp" -#include "ggml-sycl/presets.hpp" -#include "ggml.h" - - -static void cpy_1_f32_f32(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_f32_f16(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - sycl::half * dsti = (sycl::half *) cdsti; - - *dsti = sycl::vec(*xi).convert()[0]; -} - -static void cpy_1_f16_f16(const char * cxi, char * cdsti) { - const sycl::half * xi = (const sycl::half *) cxi; - sycl::half * dsti = (sycl::half *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_f16_f32(const char * cxi, char * cdsti) { - const sycl::half * xi = (const sycl::half *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_i16_i16(const char * cxi, char * cdsti) { - const int16_t * xi = (const int16_t *) cxi; - int16_t * dsti = (int16_t *) cdsti; - - *dsti = *xi; -} - -static void cpy_1_i32_i32(const char * cxi, char * cdsti) { - const int32_t * xi = (const int32_t *) cxi; - int32_t * dsti = (int32_t *) cdsti; - - *dsti = *xi; -} - -template -static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, - const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, - const sycl::nd_item<3> & item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); - - if (i >= ne) { - return; - } - - // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor - // then combine those indices with the corresponding byte offsets to get the total offsets - const int i03 = i / (ne00 * ne01 * ne02); - const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); - const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; - const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; - const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; - - const int i13 = i / (ne10 * ne11 * ne12); - const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); - const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; - const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; - const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; - - cpy_1(cx + x_offset, cdst + dst_offset); -} - - -/* quantized type same copy */ -template -static void cpy_blck_q_q(const char * cxi, char * cdsti) { - const T * xi = (const T *) cxi; - T * dsti = (T *) cdsti; - *dsti = *xi; -} - - -static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { - float * cdstf = (float *) (cdsti); - - for (int j = 0; j < QK8_0; j += 2) { - dfloat2 dq; - dequantize_q8_0(cxi, 0, j, dq); - *(cdstf + j) = dq.x(); - *(cdstf + j + 1) = dq.y(); - } -} - - - -template static void cpy_blck_q_f32(const char * cxi, char * cdsti) { - float * cdstf = (float *) (cdsti); - - for (int j = 0; j < qk / 2; j++) { - dfloat2 dq; - dequant(cxi, 0, j, dq); - *(cdstf + j) = dq.x(); - *(cdstf + j + qk / 2) = dq.y(); - } -} - - -template -static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, - const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, - const sycl::nd_item<3> & item_ct1) { - const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; - - if (i >= ne) { - return; - } - - const int i03 = i / (ne00 * ne01 * ne02); - const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); - const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; - const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; - const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; - - - const int i13 = i / (ne10 * ne11 * ne12); - const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); - const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; - const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; - const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; - - cpy_blck_q_q(cx + x_offset, cdst + dst_offset); -} - -template -static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, - const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, - const sycl::nd_item<3> & item_ct1) { - const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; - - if (i >= ne) { - return; - } - - - const int i03 = i / (ne00 * ne01 * ne02); - const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); - const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; - const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; - const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; - - const int i13 = i / (ne10 * ne11 * ne12); - const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); - const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; - const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; - const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; - - cpy_blck(cx + x_offset, cdst + dst_offset); -} - -template -static void cpy_q_f32(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, - const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, - const sycl::nd_item<3> & item_ct1) { - const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; - - if (i >= ne) { - return; - } - - const int i03 = i / (ne00 * ne01 * ne02); - const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); - const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; - const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; - const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; - - const int i13 = i / (ne10 * ne11 * ne12); - const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); - const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; - const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; - const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; - - cpy_blck(cx + x_offset, cdst + dst_offset); -} - -static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - GGML_ASSERT(ne % QK8_0 == 0); - const int num_blocks = ne / QK8_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - -static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ne; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_q_f32(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - -static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - GGML_ASSERT(ne % QK4_0 == 0); - const int num_blocks = ne / QK4_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - -static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_f32, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - GGML_ASSERT(ne % QK4_1 == 0); - const int num_blocks = ne / QK4_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - -static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_f32, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - GGML_ASSERT(ne % QK5_0 == 0); - const int num_blocks = ne / QK5_0; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - -static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_f32, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - GGML_ASSERT(ne % QK5_1 == 0); - const int num_blocks = ne / QK5_1; - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - -static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ne; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_f32, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, - nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, - item_ct1); - }); -} - -static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - GGML_ASSERT(ne % QK4_NL == 0); - const int num_blocks = ne / QK4_NL; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { - cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, - ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - -static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - // dpct::has_capability_or_fail(stream->get_device(), - // {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; - { - // dpct::has_capability_or_fail(stream->get_device(), - // {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, item_ct1); - }); - } -} - -static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - - -static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - - -static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); - - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - - -static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - - -static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, - const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, - const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13, queue_ptr stream) { - - const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { - cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); - }); -} - -void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { - // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field - scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0)); - const int64_t ne = ggml_nelements(src0); - GGML_ASSERT(ne == ggml_nelements(src1)); - - GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); - GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - - GGML_TENSOR_BINARY_OP_LOCALS01; - - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - queue_ptr main_stream = ctx.stream(); - - char * src0_ddc = (char *) src0->data; - char * src1_ddc = (char *) src1->data; - if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) { - GGML_SYCL_DEBUG("%s: memcpy path\n", __func__); - main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0)); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f16_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { - ggml_cpy_i16_i16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { - ggml_cpy_i32_i32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) { - ggml_cpy_q4_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) { - ggml_cpy_q4_1_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) { - ggml_cpy_q8_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) { - ggml_cpy_f32_q5_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) { - ggml_cpy_q5_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) { - ggml_cpy_f32_q5_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) { - ggml_cpy_q5_1_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, - nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { - ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, - nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_q8_0_q8_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_Q5_0) { - ggml_cpy_q5_0_q5_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_Q5_1) { - ggml_cpy_q5_1_q5_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_q4_0_q4_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_q4_1_q4_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else { - GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), - ggml_type_name(src1->type)); - GGML_ABORT("fatal error"); - } -} catch (const sycl::exception & exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_cpy(ctx, dst->src[0], dst); -} +#include "cpy.hpp" + +#include + +#include "dequantize.hpp" +#include "ggml-sycl/common.hpp" +#include "ggml-sycl/presets.hpp" +#include "ggml.h" + + +static void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + sycl::half * dsti = (sycl::half *) cdsti; + + *dsti = sycl::vec(*xi).convert()[0]; +} + +static void cpy_1_f16_f16(const char * cxi, char * cdsti) { + const sycl::half * xi = (const sycl::half *) cxi; + sycl::half * dsti = (sycl::half *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_f16_f32(const char * cxi, char * cdsti) { + const sycl::half * xi = (const sycl::half *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_i16_i16(const char * cxi, char * cdsti) { + const int16_t * xi = (const int16_t *) cxi; + int16_t * dsti = (int16_t *) cdsti; + + *dsti = *xi; +} + +static void cpy_1_i32_i32(const char * cxi, char * cdsti) { + const int32_t * xi = (const int32_t *) cxi; + int32_t * dsti = (int32_t *) cdsti; + + *dsti = *xi; +} + +template +static void cpy_f32_f16(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); + + if (i >= ne) { + return; + } + + // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + + +/* quantized type same copy */ +template +static void cpy_blck_q_q(const char * cxi, char * cdsti) { + const T * xi = (const T *) cxi; + T * dsti = (T *) cdsti; + *dsti = *xi; +} + + +static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) { + float * cdstf = (float *) (cdsti); + + for (int j = 0; j < QK8_0; j += 2) { + dfloat2 dq; + dequantize_q8_0(cxi, 0, j, dq); + *(cdstf + j) = dq.x(); + *(cdstf + j + 1) = dq.y(); + } +} + + + +template static void cpy_blck_q_f32(const char * cxi, char * cdsti) { + float * cdstf = (float *) (cdsti); + + for (int j = 0; j < qk / 2; j++) { + dfloat2 dq; + dequant(cxi, 0, j, dq); + *(cdstf + j) = dq.x(); + *(cdstf + j + qk / 2) = dq.y(); + } +} + + +template +static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; + + if (i >= ne) { + return; + } + + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; + + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_blck_q_q(cx + x_offset, cdst + dst_offset); +} + +template +static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; + + if (i >= ne) { + return; + } + + + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = i00 * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_blck(cx + x_offset, cdst + dst_offset); +} + +template +static void cpy_q_f32(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, + const sycl::nd_item<3> & item_ct1) { + const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk; + + if (i >= ne) { + return; + } + + const int i03 = i / (ne00 * ne01 * ne02); + const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01); + const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00; + const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00; + const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03; + + const int i13 = i / (ne10 * ne11 * ne12); + const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11); + const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10; + const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10; + const int dst_offset = i10 * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13; + + cpy_blck(cx + x_offset, cdst + dst_offset); +} + +static void ggml_cpy_f16_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_f32_q8_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK8_0 == 0); + const int num_blocks = ne / QK8_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_q8_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ne; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_q_f32(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_f32_q4_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK4_0 == 0); + const int num_blocks = ne / QK4_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_q4_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ne; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_f32, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); + }); +} + +static void ggml_cpy_f32_q4_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK4_1 == 0); + const int num_blocks = ne / QK4_1; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_q4_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ne; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_f32, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); + }); +} + +static void ggml_cpy_f32_q5_0_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK5_0 == 0); + const int num_blocks = ne / QK5_0; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_q5_0_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ne; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_f32, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); + }); +} + +static void ggml_cpy_f32_q5_1_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK5_1 == 0); + const int num_blocks = ne / QK5_1; + stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, + ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_q5_1_f32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ne; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_f32, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, + nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, + item_ct1); + }); +} + +static void ggml_cpy_f32_iq4_nl_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + GGML_ASSERT(ne % QK4_NL == 0); + const int num_blocks = ne / QK4_NL; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), [=](sycl::nd_item<3> item_ct1) { + cpy_f32_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, + ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +static void ggml_cpy_f16_f16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_i16_i16_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + // dpct::has_capability_or_fail(stream->get_device(), + // {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + +static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + + +static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, + const int ne02, const int nb00, const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13, queue_ptr stream) { + + const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) { + cpy_q_q(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1); + }); +} + +void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try { + // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field + scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0, debug_get_tensor_str("\tsrc0", src0)); + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + GGML_TENSOR_BINARY_OP_LOCALS01; + + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + queue_ptr main_stream = ctx.stream(); + + char * src0_ddc = (char *) src0->data; + char * src1_ddc = (char *) src1->data; + if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) { + GGML_SYCL_DEBUG("%s: memcpy path\n", __func__); + main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0)); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f16_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f16_f16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { + ggml_cpy_i16_i16_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) { + ggml_cpy_i32_i32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) { + ggml_cpy_q4_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) { + ggml_cpy_q4_1_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) { + ggml_cpy_q8_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) { + ggml_cpy_f32_q5_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) { + ggml_cpy_q5_0_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) { + ggml_cpy_f32_q5_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) { + ggml_cpy_q5_1_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, + nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { + ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_q8_0_q8_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_Q5_0) { + ggml_cpy_q5_0_q5_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_Q5_1) { + ggml_cpy_q5_1_q5_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_q4_0_q4_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_q4_1_q4_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else { + GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type), + ggml_type_name(src1->type)); + GGML_ABORT("fatal error"); + } +} catch (const sycl::exception & exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_cpy(ctx, dst->src[0], dst); +} diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp index 3c331f1ef27..a2bb7cc1cbd 100644 --- a/ggml/src/ggml-sycl/cpy.hpp +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -1,223 +1,223 @@ -#ifndef GGML_SYCL_CPY_HPP -#define GGML_SYCL_CPY_HPP - -#include "common.hpp" -#include - -typedef void (*cpy_kernel_t)(const char * cx, char * cdst); - -__dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { - if (x <= val[0]) { - return 0; - } - if (x >= val[n - 1]) { - return n - 1; - } - int ml = 0, mu = n - 1; - while (mu - ml > 1) { - int mav = (ml + mu) / 2; - if (x < val[mav]) { - mu = mav; - } else { - ml = mav; - } - } - return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu; -} - -inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q8_0 * dsti = (block_q8_0 *) cdsti; - - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = xi[j]; - amax = sycl::fmax(amax, sycl::fabs((float) v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f / d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = xi[j] * id; - - dsti->qs[j] = sycl::round((float) x0); - } -} - -inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_0 * dsti = (block_q4_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_0; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float) v)) { - amax = sycl::fabs((float) v); - vmax = v; - } - } - - const float d = vmax / -8; - const float id = d ? 1.0f / d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK4_0 / 2; ++j) { - const float x0 = xi[0 + j] * id; - const float x1 = xi[QK4_0 / 2 + j] * id; - - const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -inline void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_1 * dsti = (block_q4_1 *) cdsti; - - float vmin = FLT_MAX; - float vmax = -FLT_MAX; - - for (int j = 0; j < QK4_1; ++j) { - const float v = xi[j]; - - vmin = sycl::min(v, vmin); - vmax = sycl::max(v, vmax); - } - - const float d = (vmax - vmin) / ((1 << 4) - 1); - const float id = d ? 1.0f / d : 0.0f; - - dsti->dm.x() = d; - dsti->dm.y() = vmin; - - for (int j = 0; j < QK4_1 / 2; ++j) { - const float x0 = (xi[0 + j] - vmin) * id; - const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id; - - const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f)); - const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -inline void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q5_0 * dsti = (block_q5_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK5_0; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float) v)) { - amax = sycl::fabs((float) v); - vmax = v; - } - } - - const float d = vmax / -16; - const float id = d ? 1.0f / d : 0.0f; - - dsti->d = d; - - uint32_t qh = 0; - for (int j = 0; j < QK5_0 / 2; ++j) { - const float x0 = xi[0 + j] * id; - const float x1 = xi[QK5_0 / 2 + j] * id; - - const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f)); - const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f)); - - dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2); - } - memcpy(dsti->qh, &qh, sizeof(qh)); -} - -inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q5_1 * dsti = (block_q5_1 *) cdsti; - - float min = xi[0]; - float max = xi[0]; - - for (int j = 1; j < QK5_1; ++j) { - const float v = xi[j]; - min = v < min ? v : min; - max = v > max ? v : max; - } - - const float d = (max - min) / 31; - const float id = d ? 1.0f / d : 0.0f; - - dsti->dm.x() = d; - dsti->dm.y() = min; - - uint32_t qh = 0; - for (int j = 0; j < QK5_1 / 2; ++j) { - const float x0 = (xi[0 + j] - min) * id; - const float x1 = (xi[QK5_1 / 2 + j] - min) * id; - - const uint8_t xi0 = (uint8_t) (x0 + 0.5f); - const uint8_t xi1 = (uint8_t) (x1 + 0.5f); - - dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2); - } - memcpy(dsti->qh, &qh, sizeof(qh)); -} - -inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_iq4_nl * dsti = (block_iq4_nl *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_NL; ++j) { - const float v = xi[j]; - if (amax < sycl::fabs((float) v)) { - amax = sycl::fabs((float) v); - vmax = v; - } - } - - float d = vmax / kvalues_iq4nl[0]; - const float id = d ? 1.0f / d : 0.0f; - - float sumqx = 0, sumq2 = 0; - for (int j = 0; j < QK4_NL / 2; ++j) { - const float x0 = xi[0 + j] * id; - const float x1 = xi[QK4_NL / 2 + j] * id; - const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); - const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); - dsti->qs[j] = xi0 | (xi1 << 4); - const float v0 = kvalues_iq4nl[xi0]; - const float v1 = kvalues_iq4nl[xi1]; - const float w0 = xi[0 + j] * xi[0 + j]; - const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j]; - sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j]; - sumq2 += w0 * v0 * v0 + w1 * v1 * v1; - } - - dsti->d = sumq2 > 0 ? sumqx / sumq2 : d; -} - -void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); -void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -#endif // GGML_SYCL_CPY_HPP +#ifndef GGML_SYCL_CPY_HPP +#define GGML_SYCL_CPY_HPP + +#include "common.hpp" +#include + +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); + +__dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) { + if (x <= val[0]) { + return 0; + } + if (x >= val[n - 1]) { + return n - 1; + } + int ml = 0, mu = n - 1; + while (mu - ml > 1) { + int mav = (ml + mu) / 2; + if (x < val[mav]) { + mu = mav; + } else { + ml = mav; + } + } + return x - val[mu - 1] < val[mu] - x ? mu - 1 : mu; +} + +inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = sycl::fmax(amax, sycl::fabs((float) v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j] * id; + + dsti->qs[j] = sycl::round((float) x0); + } +} + +inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float) v)) { + amax = sycl::fabs((float) v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0 / 2; ++j) { + const float x0 = xi[0 + j] * id; + const float x1 = xi[QK4_0 / 2 + j] * id; + + const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 8.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +inline void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + vmin = sycl::min(v, vmin); + vmax = sycl::max(v, vmax); + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 1.0f / d : 0.0f; + + dsti->dm.x() = d; + dsti->dm.y() = vmin; + + for (int j = 0; j < QK4_1 / 2; ++j) { + const float x0 = (xi[0 + j] - vmin) * id; + const float x1 = (xi[QK4_1 / 2 + j] - vmin) * id; + + const uint8_t xi0 = dpct::min(15, (int8_t) (x0 + 0.5f)); + const uint8_t xi1 = dpct::min(15, (int8_t) (x1 + 0.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +inline void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q5_0 * dsti = (block_q5_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK5_0; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float) v)) { + amax = sycl::fabs((float) v); + vmax = v; + } + } + + const float d = vmax / -16; + const float id = d ? 1.0f / d : 0.0f; + + dsti->d = d; + + uint32_t qh = 0; + for (int j = 0; j < QK5_0 / 2; ++j) { + const float x0 = xi[0 + j] * id; + const float x1 = xi[QK5_0 / 2 + j] * id; + + const uint8_t xi0 = dpct::min(31, (int8_t) (x0 + 16.5f)); + const uint8_t xi1 = dpct::min(31, (int8_t) (x1 + 16.5f)); + + dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0 / 2); + } + memcpy(dsti->qh, &qh, sizeof(qh)); +} + +inline void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q5_1 * dsti = (block_q5_1 *) cdsti; + + float min = xi[0]; + float max = xi[0]; + + for (int j = 1; j < QK5_1; ++j) { + const float v = xi[j]; + min = v < min ? v : min; + max = v > max ? v : max; + } + + const float d = (max - min) / 31; + const float id = d ? 1.0f / d : 0.0f; + + dsti->dm.x() = d; + dsti->dm.y() = min; + + uint32_t qh = 0; + for (int j = 0; j < QK5_1 / 2; ++j) { + const float x0 = (xi[0 + j] - min) * id; + const float x1 = (xi[QK5_1 / 2 + j] - min) * id; + + const uint8_t xi0 = (uint8_t) (x0 + 0.5f); + const uint8_t xi1 = (uint8_t) (x1 + 0.5f); + + dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1 / 2); + } + memcpy(dsti->qh, &qh, sizeof(qh)); +} + +inline void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_iq4_nl * dsti = (block_iq4_nl *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_NL; ++j) { + const float v = xi[j]; + if (amax < sycl::fabs((float) v)) { + amax = sycl::fabs((float) v); + vmax = v; + } + } + + float d = vmax / kvalues_iq4nl[0]; + const float id = d ? 1.0f / d : 0.0f; + + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < QK4_NL / 2; ++j) { + const float x0 = xi[0 + j] * id; + const float x1 = xi[QK4_NL / 2 + j] * id; + const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); + const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); + dsti->qs[j] = xi0 | (xi1 << 4); + const float v0 = kvalues_iq4nl[xi0]; + const float v1 = kvalues_iq4nl[xi1]; + const float w0 = xi[0 + j] * xi[0 + j]; + const float w1 = xi[QK4_NL / 2 + j] * xi[QK4_NL / 2 + j]; + sumqx += w0 * v0 * xi[j] + w1 * v1 * xi[QK4_NL / 2 + j]; + sumq2 += w0 * v0 * v0 + w1 * v1 * v1; + } + + dsti->d = sumq2 > 0 ? sumqx / sumq2 : d; +} + +void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1); +void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_CPY_HPP diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index 540539bb223..49bf7b7ffbf 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -1,823 +1,823 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_DEQUANTIZE_HPP -#define GGML_SYCL_DEQUANTIZE_HPP - -#include "common.hpp" - -typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v); -typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs, - const int iqs, dfloat2 &v); - -static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib, - const int iqs, dfloat2 &v) { - const block_q4_0 * x = (const block_q4_0 *) vx; - - const dfloat d = x[ib].d; - - const int vui = x[ib].qs[iqs]; - - v.x() = vui & 0xF; - v.y() = vui >> 4; - -#ifdef GGML_SYCL_F16 - // v = v - {8.0f, 8.0f}; - // v = v * {d, d}; - v.s0() = (v.s0() - 8.0f) * d; - v.s1() = (v.s1() - 8.0f) * d; - -#else - v.x() = (v.x() - 8.0f) * d; - v.y() = (v.y() - 8.0f) * d; -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int64_t ib, const void *qs, - const int iqs, dfloat2 &v) { - // const block_q4_0 * x = (const block_q4_0 *) vx; - - const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib); - - const int vui = *((const uint8_t *)qs+iqs); - - v.x() = vui & 0xF; - v.y() = vui >> 4; - -#ifdef GGML_SYCL_F16 - // v = v - {8.0f, 8.0f}; - // v = v * {d, d}; - v.s0() = (v.s0() - 8.0f) * d; - v.s1() = (v.s1() - 8.0f) * d; - -#else - v.x() = (v.x() - 8.0f) * d; - v.y() = (v.y() - 8.0f) * d; -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib, - const int iqs, dfloat2 &v) { - const block_q4_1 * x = (const block_q4_1 *) vx; - - const dfloat d = x[ib].dm[0]; - const dfloat m = x[ib].dm[1]; - - const int vui = x[ib].qs[iqs]; - - v.x() = vui & 0xF; - v.y() = vui >> 4; - -#ifdef GGML_SYCL_F16 - // v = v * {d, d}; - // v = v + {m, m}; - v.s0() = sycl::fma(v.s0(), d, m); - v.s1() = sycl::fma(v.s1(), d, m); - -#else - v.x() = sycl::fma(v.x(), d, m); - v.y() = sycl::fma(v.y(), d, m); -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q5_0(const void *vx, const int64_t ib, - const int iqs, dfloat2 &v) { - const block_q5_0 * x = (const block_q5_0 *) vx; - - const dfloat d = x[ib].d; - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); - v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); - -#ifdef GGML_SYCL_F16 - // v = v - {16.0f, 16.0f}; - // v = v * {d, d}; - v.s0() = (v.s0() - 16.0f) * d; - v.s1() = (v.s1() - 16.0f) * d; - -#else - v.x() = (v.x() - 16.0f) * d; - v.y() = (v.y() - 16.0f) * d; -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib, - const int iqs, dfloat2 &v) { - const block_q5_1 * x = (const block_q5_1 *) vx; - - const dfloat d = x[ib].dm[0]; - const dfloat m = x[ib].dm[1]; - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); - v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); - -#ifdef GGML_SYCL_F16 - // v = v * {d, d}; - // v = v + {m, m}; - v.s0() = sycl::fma(v.s0(), d, m); - v.s1() = sycl::fma(v.s1(), d, m); -#else - v.x() = sycl::fma(v.x(), d, m); - v.y() = sycl::fma(v.y(), d, m); -#endif // GGML_SYCL_F16 -} - -static __dpct_inline__ void dequantize_q8_0(const void *vx, const int64_t ib, - const int iqs, dfloat2 &v) { - const block_q8_0 * x = (const block_q8_0 *) vx; - - const dfloat d = x[ib].d; - - v.x() = x[ib].qs[iqs + 0]; - v.y() = x[ib].qs[iqs + 1]; - -#ifdef GGML_SYCL_F16 - // v = v * {d, d}; - v.s0() *= d; - v.s1() *= d; -#else - v.x() *= d; - v.y() *= d; -#endif // GGML_SYCL_F16 -} - -template -static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32, - const sycl::nd_item<3> &item_ct1) { - - const int64_t i = item_ct1.get_group(2); - - // assume 32 threads - const int64_t tid = item_ct1.get_local_id(2); - const int64_t il = tid/8; - const int64_t ir = tid%8; - const int64_t ib = 8*i + ir; - if (ib >= nb32) { - return; - } - - dst_t * y = yy + 256*i + 32*ir + 4*il; - - const block_q4_0 * x = (const block_q4_0 *)vx + ib; - const float d = sycl::vec(x->d) - .convert()[0]; - const float dm = -8*d; - - const uint8_t * q = x->qs + 4*il; - - for (int l = 0; l < 4; ++l) { - y[l+ 0] = d * (q[l] & 0xF) + dm; - y[l+16] = d * (q[l] >> 4) + dm; - } -} - -template -static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32, - const sycl::nd_item<3> &item_ct1) { - - const int64_t i = item_ct1.get_group(2); - auto k=nb32; - // assume 32 threads - const int64_t tid = item_ct1.get_local_id(2); - const int lane_ib = i * WARP_SIZE + tid; - - if (lane_ib >= k / QK4_0) { - return; - } - - dst_t * y_ptr = yy + lane_ib * QK4_0; - - auto qs = (const uint8_t*)vx + lane_ib * QK4_0 / 2; - auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k / 2) + lane_ib; - - const float d = float(*s_ptr); - -#pragma unroll - for (int l = 0; l < QK4_0 / 2; ++l) { - int vq = qs[l]; - y_ptr[l + 0] = d * ((vq & 0xF) - 8); - y_ptr[l + 16] = d * ((vq >> 4) - 8); - } - -} - -template -static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32, - const sycl::nd_item<3> &item_ct1) { - - const int64_t i = item_ct1.get_group(2); - - // assume 32 threads - const int64_t tid = item_ct1.get_local_id(2); - const int64_t il = tid/8; - const int64_t ir = tid%8; - const int64_t ib = 8*i + ir; - if (ib >= nb32) { - return; - } - - dst_t * y = yy + 256*i + 32*ir + 4*il; - - const block_q4_1 * x = (const block_q4_1 *)vx + ib; - const sycl::float2 d = - x->dm.convert(); - - const uint8_t * q = x->qs + 4*il; - - for (int l = 0; l < 4; ++l) { - y[l + 0] = d.x() * (q[l] & 0xF) + d.y(); - y[l + 16] = d.x() * (q[l] >> 4) + d.y(); - } -} - - -//================================== k-quants - -template -static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - - const int64_t i = item_ct1.get_group(2); - const block_q2_K * x = (const block_q2_K *) vx; - - const int64_t tid = item_ct1.get_local_id(2); -#if QK_K == 256 - const int64_t n = tid/32; - const int64_t l = tid - 32*n; - const int64_t is = 8*n + l/16; - - const uint8_t q = x[i].qs[32*n + l]; - dst_t * y = yy + i*QK_K + 128*n; - - float dall = x[i].dm[0]; - float dmin = x[i].dm[1]; - y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); - y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); - y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); - y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); -#else - const int64_t is = tid/16; // 0 or 1 - const int64_t il = tid%16; // 0...15 - const uint8_t q = x[i].qs[il] >> (2*is); - dst_t * y = yy + i*QK_K + 16*is + il; - - float dall = x[i].dm[0]; - float dmin = x[i].dm[1]; - y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); - y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); -#endif - -} - -template -static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - - const int64_t i = item_ct1.get_group(2); - const block_q3_K * x = (const block_q3_K *) vx; - -#if QK_K == 256 - const int64_t r = item_ct1.get_local_id(2) / 4; - const int64_t tid = r/2; - const int64_t is0 = r%2; - const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4); - const int64_t n = tid / 4; - const int64_t j = tid - 4*n; - - uint8_t m = 1 << (4*n + j); - int64_t is = 8*n + 2*j + is0; - int shift = 2*j; - - int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : - is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : - is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : - (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); - float d_all = x[i].d; - float dl = d_all * (us - 32); - - dst_t * y = yy + i*QK_K + 128*n + 32*j; - const uint8_t * q = x[i].qs + 32*n; - const uint8_t * hm = x[i].hmask; - - for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); -#else - const int64_t tid = item_ct1.get_local_id(2); - const int64_t is = tid/16; // 0 or 1 - const int64_t il = tid%16; // 0...15 - const int64_t im = il/8; // 0...1 - const int64_t in = il%8; // 0...7 - - dst_t * y = yy + i*QK_K + 16*is + il; - - const uint8_t q = x[i].qs[il] >> (2*is); - const uint8_t h = x[i].hmask[in] >> (2*is + im); - const float d = (float)x[i].d; - - if (is == 0) { - y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } else { - y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } -#endif - -} - -#if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { - if (j < 4) { - d = q[j] & 63; - m = q[j + 4] & 63; - } else { - d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - } -} -#endif - -template -inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall, - const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) { - const int is = 2 * il; - constexpr int n = 4; - - uint8_t sc, m; - get_scale_min_k4(is + 0, scales_local, sc, m); - const float d1 = dall * sc; - const float m1 = dmin * m; - - get_scale_min_k4(is + 1, scales_local, sc, m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - sycl::vec q_vec = vec_aligned_load(qs_ptr + 32 * il + n * ir); - for (int l = 0; l < n; ++l) { - y[l + 0] = d1 * (q_vec[l] & 0xF) - m1; - y[l + 32] = d2 * (q_vec[l] >> 4) - m2; - } -} - -template -static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) { - const block_q4_K * x = (const block_q4_K *) vx; - - const int64_t i = item_ct1.get_group(2); - -#if QK_K == 256 - const int64_t tid = item_ct1.get_local_id(2); - const int64_t il = tid / 8; - const int64_t ir = tid % 8; - - dst_t * y = yy + i * QK_K + 64 * il + 4 * ir; - - const sycl::half2 dm = x[i].dm; - const float dall = dm[0]; - const float dmin = dm[1]; - - if (tid < 12) { - scales_local[tid] = x[i].scales[tid]; - } - - item_ct1.barrier(sycl::access::fence_space::local_space); - dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir); -#else - const int64_t tid = item_ct1.get_local_id(2); - const uint8_t * q = x[i].qs; - dst_t * y = yy + i*QK_K; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); - y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); -#endif -} - -template -static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local, - const sycl::nd_item<1> & item_ct1, int64_t nb) { - const int64_t i = item_ct1.get_group(0); // block index - const int64_t tid = item_ct1.get_local_id(0); // thread index within block - const int64_t il = tid / 8; - const int64_t ir = tid % 8; - - dst_t * y = yy + i * QK_K + 64 * il + 4 * ir; - - const uint8_t * base = static_cast(vx); - const size_t qs_offset = i * (QK_K / 2); - const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE; - const size_t dm_offset = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2); - - const uint8_t * qs_ptr = base + qs_offset; - const uint8_t * scales_ptr = base + scales_offset; - ggml_half2 dm_values = *reinterpret_cast(base + dm_offset); - - const float dall = dm_values.x(); - const float dmin = dm_values.y(); - - if (tid < 12) { - scales_local[tid] = scales_ptr[tid]; - } - - item_ct1.barrier(sycl::access::fence_space::local_space); - dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir); -} - -template -static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - const block_q5_K * x = (const block_q5_K *) vx; - - const int64_t i = item_ct1.get_group(2); - -#if QK_K == 256 - // assume 64 threads - this is very slightly better than the one below - const int64_t tid = item_ct1.get_local_id(2); - const int64_t il = tid/16; // il is in 0...3 - const int64_t ir = tid%16; // ir is in 0...15 - const int64_t is = 2*il; // is is in 0...6 - - dst_t * y = yy + i*QK_K + 64*il + 2*ir; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint8_t * ql = x[i].qs + 32*il + 2*ir; - const uint8_t * qh = x[i].qh + 2*ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const float d1 = dall * sc; const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const float d2 = dall * sc; const float m2 = dmin * m; - - uint8_t hm = 1 << (2*il); - y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; - y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; - hm <<= 1; - y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; - y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; -#else - const int64_t tid = item_ct1.get_local_id(2); - const uint8_t q = x[i].qs[tid]; - const int64_t im = tid/8; // 0...3 - const int64_t in = tid%8; // 0...7 - const int64_t is = tid/16; // 0 or 1 - const uint8_t h = x[i].qh[in] >> im; - const float d = x[i].d; - dst_t * y = yy + i*QK_K + tid; - y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); - y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); -#endif -} - -template -static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - const block_q6_K * x = (const block_q6_K *) vx; - - const int64_t i = item_ct1.get_group(2); -#if QK_K == 256 - - // assume 64 threads - this is very slightly better than the one below - const int64_t tid = item_ct1.get_local_id(2); - const int64_t ip = tid/32; // ip is 0 or 1 - const int64_t il = tid - 32*ip; // 0...32 - const int64_t is = 8*ip + il/16; - - dst_t * y = yy + i*QK_K + 128*ip + il; - - const float d = x[i].d; - - const uint8_t * ql = x[i].ql + 64*ip + il; - const uint8_t qh = x[i].qh[32*ip + il]; - const int8_t * sc = x[i].scales + is; - - y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); - y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); - y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -#else - - // assume 32 threads - const int64_t tid = item_ct1.get_local_id(2); - const int64_t ip = tid/16; // 0 or 1 - const int64_t il = tid - 16*ip; // 0...15 - - dst_t * y = yy + i*QK_K + 16*ip + il; - - const float d = x[i].d; - - const uint8_t ql = x[i].ql[16*ip + il]; - const uint8_t qh = x[i].qh[il] >> (2*ip); - const int8_t * sc = x[i].scales; - - y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); -#endif -} - -template -static void dequantize_block_q6_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> & item_ct1, int64_t n_blocks) { - const int64_t ib = item_ct1.get_group(2); - - const int64_t tid = item_ct1.get_local_id(2); - const int64_t ip = tid / 32; // ip is 0 or 1 - const int64_t il = tid - 32 * ip; // 0...32 - const int64_t is = 8 * ip + il / 16; - - const uint8_t * base_ptr = static_cast(vx); - const auto ql_offset = ib * (QK_K / 2); - const auto qh_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * ib; - const auto base_scales_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * n_blocks + (QK_K / 16) * ib; - const auto base_d_offset = ((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks; - const uint8_t * ql_ptr = base_ptr + ql_offset; - const uint8_t * qh_ptr = base_ptr + qh_offset; - const uint8_t * scales_ptr = base_ptr + base_scales_offset; - const ggml_half * d = (const ggml_half *) (base_ptr + base_d_offset) + ib; - - dst_t * y = yy + ib * QK_K + 128 * ip + il; - - const uint8_t * ql = ql_ptr + 64 * ip + il; - const uint8_t qh = *(qh_ptr + 32 * ip + il); - const int8_t * sc = reinterpret_cast(scales_ptr + is); - - y[0] = *d * sc[0] * ((int8_t) ((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = *d * sc[2] * ((int8_t) ((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); - y[64] = *d * sc[4] * ((int8_t) ((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); - y[96] = *d * sc[6] * ((int8_t) ((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -} - -template -static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint64_t *iq2xxs_grid_ptr, - const uint8_t *ksigns_iq2xs_ptr, - const uint8_t *kmask_iq2xs_ptr) { - - const int64_t i = item_ct1.get_group(2); - const block_iq2_xxs * x = (const block_iq2_xxs *) vx; - - const int64_t tid = item_ct1.get_local_id(2); -#if QK_K == 256 - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * q2 = x[i].qs + 4*ib; - const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]); - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; - const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f); -#else - assert(false); -#endif - -} - -template -static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint64_t *iq2xs_grid, - const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { - - const int64_t i = item_ct1.get_group(2); - const block_iq2_xs * x = (const block_iq2_xs *) vx; - - const int64_t tid = item_ct1.get_local_id(2); -#if QK_K == 256 - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * q2 = x[i].qs + 4*ib; - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); - const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; - const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); -#endif - -} - -template -__dpct_inline__ static void -dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - - const int64_t i = item_ct1.get_group(2); - const block_iq2_s * x = (const block_iq2_s *) vx; - - const int64_t tid = item_ct1.get_local_id(2); -#if QK_K == 256 - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); - const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; - const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; -#pragma unroll - for (int j = 0; j < 8; ++j) - y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); - -#endif - -} - -template -static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3xxs_grid, - const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { - - const int64_t i = item_ct1.get_group(2); - const block_iq3_xxs * x = (const block_iq3_xxs *) vx; - - const int64_t tid = item_ct1.get_local_id(2); -#if QK_K == 256 - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * q3 = x[i].qs + 8*ib; - const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); - const uint32_t aux32 = gas[0] | (gas[1] << 16); - const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f; - const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); - } -#else - assert(false); -#endif - -} - -template -__dpct_inline__ static void -dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) { - - const int64_t i = item_ct1.get_group(2); - const block_iq3_s * x = (const block_iq3_s *) vx; - - const int64_t tid = item_ct1.get_local_id(2); -#if QK_K == 256 - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); - const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); - const uint8_t signs = x[i].signs[4*ib + il]; -#pragma unroll - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); - } -#else - assert(false); -#endif - -} - -template -__dpct_inline__ static void -dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq1s_grid_gpu) { - - const int64_t i = item_ct1.get_group(2); - const block_iq1_s * x = (const block_iq1_s *) vx; - - const int64_t tid = item_ct1.get_local_id(2); -#if QK_K == 256 - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; - const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); - uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; - grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; - grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; - grid32[0] &= 0x0f0f0f0f; -#pragma unroll - for (int j = 0; j < 8; ++j) { - y[j] = d * (q[j] + delta); - } -#else - assert(false); -#endif - -} - -template -__dpct_inline__ static void -dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq1s_grid_gpu) { - - const int64_t i = item_ct1.get_group(2); - const block_iq1_m * x = (const block_iq1_m *) vx; - - const int64_t tid = item_ct1.get_local_id(2); -#if QK_K == 256 - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * sc = (const uint16_t *)x[i].scales; - iq1m_scale_t scale; - scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); - const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); - const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); - const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA; - uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; - grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; - grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; - grid32[0] &= 0x0f0f0f0f; -#pragma unroll - for (int j = 0; j < 8; ++j) { - y[j] = d * (q[j] + delta); - } -#else - assert(false); -#endif - -} - -template -__dpct_inline__ static void -dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - - const int64_t i = item_ct1.get_group(2); - const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); - - const int64_t tid = item_ct1.get_local_id(2); - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 4*il; - const uint8_t * q4 = x[ib].qs + 4*il; - const float d = (float)x[ib].d; -#pragma unroll - for (int j = 0; j < 4; ++j) { - y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; - y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; - } - -} - - -template -__dpct_inline__ static void -dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy, - const sycl::nd_item<3> &item_ct1) { - const int64_t i = item_ct1.get_group(2); - const block_iq4_xs * x = (const block_iq4_xs *)vx; - - const int64_t tid = item_ct1.get_local_id(2); - const int64_t il = tid/8; // 0...3 - const int64_t ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 4*il; - const uint8_t * q4 = x[i].qs + 16*ib + 4*il; - const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); -#pragma unroll - for (int j = 0; j < 4; ++j) { - y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; - y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; - } -} - - -#endif // GGML_SYCL_DEQUANTIZE_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_DEQUANTIZE_HPP +#define GGML_SYCL_DEQUANTIZE_HPP + +#include "common.hpp" + +typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v); +typedef void (*dequantize_kernel_t_reorder)(const void *d, const int64_t ib, const void *qs, + const int iqs, dfloat2 &v); + +static __dpct_inline__ void dequantize_q4_0(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_SYCL_F16 + // v = v - {8.0f, 8.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 8.0f) * d; + v.s1() = (v.s1() - 8.0f) * d; + +#else + v.x() = (v.x() - 8.0f) * d; + v.y() = (v.y() - 8.0f) * d; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int64_t ib, const void *qs, + const int iqs, dfloat2 &v) { + // const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib); + + const int vui = *((const uint8_t *)qs+iqs); + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_SYCL_F16 + // v = v - {8.0f, 8.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 8.0f) * d; + v.s1() = (v.s1() - 8.0f) * d; + +#else + v.x() = (v.x() - 8.0f) * d; + v.y() = (v.y() - 8.0f) * d; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; + + const int vui = x[ib].qs[iqs]; + + v.x() = vui & 0xF; + v.y() = vui >> 4; + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + // v = v + {m, m}; + v.s0() = sycl::fma(v.s0(), d, m); + v.s1() = sycl::fma(v.s1(), d, m); + +#else + v.x() = sycl::fma(v.x(), d, m); + v.y() = sycl::fma(v.y(), d, m); +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q5_0(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_SYCL_F16 + // v = v - {16.0f, 16.0f}; + // v = v * {d, d}; + v.s0() = (v.s0() - 16.0f) * d; + v.s1() = (v.s1() - 16.0f) * d; + +#else + v.x() = (v.x() - 16.0f) * d; + v.y() = (v.y() - 16.0f) * d; +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = x[ib].dm[0]; + const dfloat m = x[ib].dm[1]; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y() = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + // v = v + {m, m}; + v.s0() = sycl::fma(v.s0(), d, m); + v.s1() = sycl::fma(v.s1(), d, m); +#else + v.x() = sycl::fma(v.x(), d, m); + v.y() = sycl::fma(v.y(), d, m); +#endif // GGML_SYCL_F16 +} + +static __dpct_inline__ void dequantize_q8_0(const void *vx, const int64_t ib, + const int iqs, dfloat2 &v) { + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x() = x[ib].qs[iqs + 0]; + v.y() = x[ib].qs[iqs + 1]; + +#ifdef GGML_SYCL_F16 + // v = v * {d, d}; + v.s0() *= d; + v.s1() *= d; +#else + v.x() *= d; + v.y() *= d; +#endif // GGML_SYCL_F16 +} + +template +static void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32, + const sycl::nd_item<3> &item_ct1) { + + const int64_t i = item_ct1.get_group(2); + + // assume 32 threads + const int64_t tid = item_ct1.get_local_id(2); + const int64_t il = tid/8; + const int64_t ir = tid%8; + const int64_t ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_0 * x = (const block_q4_0 *)vx + ib; + const float d = sycl::vec(x->d) + .convert()[0]; + const float dm = -8*d; + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d * (q[l] & 0xF) + dm; + y[l+16] = d * (q[l] >> 4) + dm; + } +} + +template +static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32, + const sycl::nd_item<3> &item_ct1) { + + const int64_t i = item_ct1.get_group(2); + auto k=nb32; + // assume 32 threads + const int64_t tid = item_ct1.get_local_id(2); + const int lane_ib = i * WARP_SIZE + tid; + + if (lane_ib >= k / QK4_0) { + return; + } + + dst_t * y_ptr = yy + lane_ib * QK4_0; + + auto qs = (const uint8_t*)vx + lane_ib * QK4_0 / 2; + auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k / 2) + lane_ib; + + const float d = float(*s_ptr); + +#pragma unroll + for (int l = 0; l < QK4_0 / 2; ++l) { + int vq = qs[l]; + y_ptr[l + 0] = d * ((vq & 0xF) - 8); + y_ptr[l + 16] = d * ((vq >> 4) - 8); + } + +} + +template +static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32, + const sycl::nd_item<3> &item_ct1) { + + const int64_t i = item_ct1.get_group(2); + + // assume 32 threads + const int64_t tid = item_ct1.get_local_id(2); + const int64_t il = tid/8; + const int64_t ir = tid%8; + const int64_t ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_1 * x = (const block_q4_1 *)vx + ib; + const sycl::float2 d = + x->dm.convert(); + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l + 0] = d.x() * (q[l] & 0xF) + d.y(); + y[l + 16] = d.x() * (q[l] >> 4) + d.y(); + } +} + + +//================================== k-quants + +template +static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int64_t i = item_ct1.get_group(2); + const block_q2_K * x = (const block_q2_K *) vx; + + const int64_t tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int64_t n = tid/32; + const int64_t l = tid - 32*n; + const int64_t is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + dst_t * y = yy + i*QK_K + 128*n; + + float dall = x[i].dm[0]; + float dmin = x[i].dm[1]; + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int64_t is = tid/16; // 0 or 1 + const int64_t il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + dst_t * y = yy + i*QK_K + 16*is + il; + + float dall = x[i].dm[0]; + float dmin = x[i].dm[1]; + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +template +static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int64_t i = item_ct1.get_group(2); + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int64_t r = item_ct1.get_local_id(2) / 4; + const int64_t tid = r/2; + const int64_t is0 = r%2; + const int64_t l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4); + const int64_t n = tid / 4; + const int64_t j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int64_t is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + dst_t * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int64_t tid = item_ct1.get_local_id(2); + const int64_t is = tid/16; // 0 or 1 + const int64_t il = tid%16; // 0...15 + const int64_t im = il/8; // 0...1 + const int64_t in = il%8; // 0...7 + + dst_t * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; + m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +template +inline void dequantize_q4_K_common(dst_t * __restrict__ y, const uint8_t * __restrict__ qs_ptr, const float dall, + const float dmin, uint8_t * __restrict__ scales_local, int il, int ir) { + const int is = 2 * il; + constexpr int n = 4; + + uint8_t sc, m; + get_scale_min_k4(is + 0, scales_local, sc, m); + const float d1 = dall * sc; + const float m1 = dmin * m; + + get_scale_min_k4(is + 1, scales_local, sc, m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + sycl::vec q_vec = vec_aligned_load(qs_ptr + 32 * il + n * ir); + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q_vec[l] & 0xF) - m1; + y[l + 32] = d2 * (q_vec[l] >> 4) - m2; + } +} + +template +static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int64_t i = item_ct1.get_group(2); + +#if QK_K == 256 + const int64_t tid = item_ct1.get_local_id(2); + const int64_t il = tid / 8; + const int64_t ir = tid % 8; + + dst_t * y = yy + i * QK_K + 64 * il + 4 * ir; + + const sycl::half2 dm = x[i].dm; + const float dall = dm[0]; + const float dmin = dm[1]; + + if (tid < 12) { + scales_local[tid] = x[i].scales[tid]; + } + + item_ct1.barrier(sycl::access::fence_space::local_space); + dequantize_q4_K_common(y, x[i].qs, dall, dmin, scales_local, il, ir); +#else + const int64_t tid = item_ct1.get_local_id(2); + const uint8_t * q = x[i].qs; + dst_t * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +template +static void dequantize_block_q4_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, uint8_t * scales_local, + const sycl::nd_item<1> & item_ct1, int64_t nb) { + const int64_t i = item_ct1.get_group(0); // block index + const int64_t tid = item_ct1.get_local_id(0); // thread index within block + const int64_t il = tid / 8; + const int64_t ir = tid % 8; + + dst_t * y = yy + i * QK_K + 64 * il + 4 * ir; + + const uint8_t * base = static_cast(vx); + const size_t qs_offset = i * (QK_K / 2); + const size_t scales_offset = nb * (QK_K / 2) + i * K_SCALE_SIZE; + const size_t dm_offset = nb * (QK_K / 2) + nb * K_SCALE_SIZE + i * sizeof(ggml_half2); + + const uint8_t * qs_ptr = base + qs_offset; + const uint8_t * scales_ptr = base + scales_offset; + ggml_half2 dm_values = *reinterpret_cast(base + dm_offset); + + const float dall = dm_values.x(); + const float dmin = dm_values.y(); + + if (tid < 12) { + scales_local[tid] = scales_ptr[tid]; + } + + item_ct1.barrier(sycl::access::fence_space::local_space); + dequantize_q4_K_common(y, qs_ptr, dall, dmin, scales_local, il, ir); +} + +template +static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int64_t i = item_ct1.get_group(2); + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int64_t tid = item_ct1.get_local_id(2); + const int64_t il = tid/16; // il is in 0...3 + const int64_t ir = tid%16; // ir is in 0...15 + const int64_t is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int64_t tid = item_ct1.get_local_id(2); + const uint8_t q = x[i].qs[tid]; + const int64_t im = tid/8; // 0...3 + const int64_t in = tid%8; // 0...7 + const int64_t is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + dst_t * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); +#endif +} + +template +static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int64_t i = item_ct1.get_group(2); +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int64_t tid = item_ct1.get_local_id(2); + const int64_t ip = tid/32; // ip is 0 or 1 + const int64_t il = tid - 32*ip; // 0...32 + const int64_t is = 8*ip + il/16; + + dst_t * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int64_t tid = item_ct1.get_local_id(2); + const int64_t ip = tid/16; // 0 or 1 + const int64_t il = tid - 16*ip; // 0...15 + + dst_t * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +template +static void dequantize_block_q6_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> & item_ct1, int64_t n_blocks) { + const int64_t ib = item_ct1.get_group(2); + + const int64_t tid = item_ct1.get_local_id(2); + const int64_t ip = tid / 32; // ip is 0 or 1 + const int64_t il = tid - 32 * ip; // 0...32 + const int64_t is = 8 * ip + il / 16; + + const uint8_t * base_ptr = static_cast(vx); + const auto ql_offset = ib * (QK_K / 2); + const auto qh_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * ib; + const auto base_scales_offset = (QK_K / 2) * n_blocks + (QK_K / 4) * n_blocks + (QK_K / 16) * ib; + const auto base_d_offset = ((QK_K / 2) + (QK_K / 4) + (QK_K / 16)) * n_blocks; + const uint8_t * ql_ptr = base_ptr + ql_offset; + const uint8_t * qh_ptr = base_ptr + qh_offset; + const uint8_t * scales_ptr = base_ptr + base_scales_offset; + const ggml_half * d = (const ggml_half *) (base_ptr + base_d_offset) + ib; + + dst_t * y = yy + ib * QK_K + 128 * ip + il; + + const uint8_t * ql = ql_ptr + 64 * ip + il; + const uint8_t qh = *(qh_ptr + 32 * ip + il); + const int8_t * sc = reinterpret_cast(scales_ptr + is); + + y[0] = *d * sc[0] * ((int8_t) ((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = *d * sc[2] * ((int8_t) ((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = *d * sc[4] * ((int8_t) ((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = *d * sc[6] * ((int8_t) ((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +} + +template +static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq2xxs_grid_ptr, + const uint8_t *ksigns_iq2xs_ptr, + const uint8_t *kmask_iq2xs_ptr) { + + const int64_t i = item_ct1.get_group(2); + const block_iq2_xxs * x = (const block_iq2_xxs *) vx; + + const int64_t tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * aux8 = (const uint8_t *)q2; + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid_ptr + aux8[il]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; + const uint8_t signs = ksigns_iq2xs_ptr[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs_ptr[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + +template +static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq2xs_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + + const int64_t i = item_ct1.get_group(2); + const block_iq2_xs * x = (const block_iq2_xs *) vx; + + const int64_t tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int64_t i = item_ct1.get_group(2); + const block_iq2_s * x = (const block_iq2_s *) vx; + + const int64_t tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; +#pragma unroll + for (int j = 0; j < 8; ++j) + y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); + +#endif + +} + +template +static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq3xxs_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + + const int64_t i = item_ct1.get_group(2); + const block_iq3_xxs * x = (const block_iq3_xxs *) vx; + + const int64_t tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * q3 = x[i].qs + 8*ib; + const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) { + + const int64_t i = item_ct1.get_group(2); + const block_iq3_s * x = (const block_iq3_s *) vx; + + const int64_t tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); + const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); + const uint8_t signs = x[i].signs[4*ib + il]; +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + + const int64_t i = item_ct1.get_group(2); + const block_iq1_s * x = (const block_iq1_s *) vx; + + const int64_t tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; + const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + + const int64_t i = item_ct1.get_group(2); + const block_iq1_m * x = (const block_iq1_m *) vx; + + const int64_t tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * sc = (const uint16_t *)x[i].scales; + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); + const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); + const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA; + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int64_t i = item_ct1.get_group(2); + const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); + + const int64_t tid = item_ct1.get_local_id(2); + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } + +} + + +template +__dpct_inline__ static void +dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const int64_t i = item_ct1.get_group(2); + const block_iq4_xs * x = (const block_iq4_xs *)vx; + + const int64_t tid = item_ct1.get_local_id(2); + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[i].qs + 16*ib + 4*il; + const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } +} + + +#endif // GGML_SYCL_DEQUANTIZE_HPP diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index 4f2760110c2..f2914ff7cf7 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -1,1162 +1,1162 @@ -#include "convert.hpp" -#include "dmmv.hpp" -#include "dequantize.hpp" -#include "presets.hpp" - -static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ - const sycl::half *x = (const sycl::half *)vx; - - // automatic half -> float type cast if dfloat == float - v.x() = x[ib + iqs + 0]; - v.y() = x[ib + iqs + 1]; -} - -static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ - const float * x = (const float *) vx; - - // automatic half -> float type cast if dfloat == float - v.x() = x[ib + iqs + 0]; - v.y() = x[ib + iqs + 1]; -} - -template -static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1) { - // qk = quantized weights per x block - // qr = number of quantized weights per data value in x block - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int tid = item_ct1.get_local_id(2); - - const int iter_stride = 2*GGML_SYCL_DMMV_X; - const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter - const int y_offset = qr == 1 ? 1 : qk/2; - -// partial sum for each thread -#ifdef GGML_SYCL_F16 - sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics -#else - float tmp = 0.0f; -#endif // GGML_SYCL_F16 - - for (int i = 0; i < ncols; i += iter_stride) { - const int col = i + vals_per_iter*tid; - const int ib = (row*ncols + col)/qk; // x block index - const int iqs = (col%qk)/qr; // x quant index - const int iybs = col - col%qk; // y block start index - -// processing >2 values per i iter is faster for fast GPUs -#pragma unroll - for (int j = 0; j < vals_per_iter; j += 2) { - // process 2 vals per j iter - - // dequantize - // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val - dfloat2 v; - dequantize_kernel(vx, ib, iqs + j/qr, v); - - // matrix multiplication - // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 -#ifdef GGML_SYCL_F16 - dfloat2 t1{y[iybs + iqs + j / qr + 0], - y[iybs + iqs + j / qr + y_offset]}; - - tmp += v * t1; -#else - tmp += v.x() * y[iybs + iqs + j / qr + 0]; - tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; -#endif // GGML_SYCL_F16 - } - } - - // sum up partial sums and write back result - const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2; - for (int mask = mask_start; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (tid == 0) { -#ifdef GGML_SYCL_F16 - dst[row] = tmp.x() + tmp.y(); -#else - dst[row] = tmp; -#endif // GGML_SYCL_F16 - } -} - -template -static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1) { - // qk = quantized weights per x block - // qr = number of quantized weights per data value in x block - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - - if (row >= nrows) { - return; - } - - const int tid = item_ct1.get_local_id(2); - - - const int ncols_left = ncols % (QK4_0*WARP_SIZE); - const int ncols_align = ncols - ncols_left; - const int iter_stride = 8*2*GGML_SYCL_DMMV_X; - const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter //64/16=4, 512/16/2= 16 - const int y_offset = qr == 1 ? 1 : qk/2; - -// partial sum for each thread -#ifdef GGML_SYCL_F16 - sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics -#else - float tmp = 0.0f; -#endif // GGML_SYCL_F16 - const char *d_ptr = (const char*)vx+ncols*nrows/2; - int i=0; - for (i = 0; i < ncols_align; i += iter_stride) { - const int col = i + vals_per_iter*tid; - const int ib = (row*ncols + col)/qk; // x block index - const int iqs = (col%qk)/qr; // x quant index - const int iybs = col - col%qk; // y block start index - -// processing >2 values per i iter is faster for fast GPUs -#pragma unroll - for (int j = 0; j < vals_per_iter; j += 2) { - // process 2 vals per j iter - - // dequantize - // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val - dfloat2 v; - dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v); - - // matrix multiplication - // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 -#ifdef GGML_SYCL_F16 - dfloat2 t1{y[iybs + iqs + j / qr + 0], - y[iybs + iqs + j / qr + y_offset]}; - - tmp += v * t1; -#else - tmp += v.x() * y[iybs + iqs + j / qr + 0]; - tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; -#endif // GGML_SYCL_F16 - } - } - - for (; i < ncols; i += iter_stride) { - if (tid>=ncols_left/QK4_0) continue; - const int col = i + vals_per_iter*tid; - const int ib = (row*ncols + col)/qk; // x block index - const int iqs = (col%qk)/qr; // x quant index - const int iybs = col - col%qk; // y block start index - -// processing >2 values per i iter is faster for fast GPUs -#pragma unroll - for (int j = 0; j < vals_per_iter; j += 2) { - // process 2 vals per j iter - - // dequantize - // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val - dfloat2 v; - dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v); - - // matrix multiplication - // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 -#ifdef GGML_SYCL_F16 - dfloat2 t1{y[iybs + iqs + j / qr + 0], - y[iybs + iqs + j / qr + y_offset]}; - - tmp += v * t1; -#else - tmp += v.x() * y[iybs + iqs + j / qr + 0]; - tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; -#endif // GGML_SYCL_F16 - } - } - - // sum up partial sums and write back result - const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2; - for (int mask = mask_start; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (tid == 0) { -#ifdef GGML_SYCL_F16 - dst[row] = tmp.x() + tmp.y(); -#else - dst[row] = tmp; -#endif // GGML_SYCL_F16 - } -} - -static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, - nrows, item_ct1); - }); - } -} - -/* -DPCT1110:4: The total declared local variable size in device function -dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ -static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, - const float *__restrict__ yy, - float *__restrict__ dst, - const int ncols, int nrows, - const sycl::nd_item<3> &item_ct1) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q2_K * x = (const block_q2_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - -#if QK_K == 256 - const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 - const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 16/K_QUANTS_PER_ITERATION; - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int s_offset = 8*im; - const int y_offset = 128*im + l0; - - uint32_t aux[4]; - const uint8_t * d = (const uint8_t *)aux; - const uint8_t * m = (const uint8_t *)(aux + 2); - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = a[1] & 0x0f0f0f0f; - aux[2] = (a[0] >> 4) & 0x0f0f0f0f; - aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) - + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) - + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) - + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) - + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) - + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) - + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) - +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); - sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] - + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; - - } - tmp += dall * sum1 - dmin * sum2; - - } -#else - const int tid = item_ct1.get_local_id(2) / - (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = item_ct1.get_local_id(2) % - (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; - - uint32_t uaux[2]; - const uint8_t * d = (const uint8_t *)uaux; - - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint32_t * s = (const uint32_t *)x[i].scales; - - uaux[0] = s[0] & 0x0f0f0f0f; - uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; - - const sycl::float2 dall = - x[i].dm.convert(); - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t ql = q[l]; - sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) - + y[l+16] * d[1] * ((ql >> 2) & 3) - + y[l+32] * d[2] * ((ql >> 4) & 3) - + y[l+48] * d[3] * ((ql >> 6) & 3); - sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; - } - tmp += dall.x() * sum1 - dall.y() * sum2; - } - -#endif - - // sum up partial sums and write back result -#pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -/* -DPCT1110:5: The total declared local variable size in device function -dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ -static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, - const float *__restrict__ yy, - float *__restrict__ dst, - const int ncols, int nrows, - const sycl::nd_item<3> &item_ct1) { - - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q3_K * x = (const block_q3_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - -#if QK_K == 256 - - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop - const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); - - const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; - - uint16_t utmp[4]; - const int8_t * s = (const int8_t *)utmp; - - const uint16_t s_shift = 4*im; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; - const uint8_t * h = x[i].hmask + l0; - - const uint16_t * a = (const uint16_t *)x[i].scales; - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - - const float d = x[i].d; - - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); - } - tmp += d * sum; - - } -#else - - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 - const int in = offset/8; // 0 or 1 - const int im = offset%8; // 0...7 - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint8_t * s = x[i].scales; - - const float dall = (float)x[i].d; - - float sum = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t hl = x[i].hmask[im+l] >> in; - const uint8_t ql = q[l]; - sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) - + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) - + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) - + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); - } - tmp += sum; - } -#endif - - // sum up partial sums and write back result -#pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -/* -DPCT1110:6: The total declared local variable size in device function -dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ -static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, - const float *__restrict__ yy, - float *__restrict__ dst, - const int ncols, int nrows, - const sycl::nd_item<3> &item_ct1) { - - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - if (row > nrows) return; - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q4_K * x = (const block_q4_K *)vx + ib0; - -#if QK_K == 256 - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 - - const int il = tid/step; // 0...3 - const int ir = tid - step*il; // 0...7 or 0...3 - const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - -#if K_QUANTS_PER_ITERATION == 2 - uint32_t q32[4]; - const uint8_t * q4 = (const uint8_t *)q32; -#else - uint16_t q16[4]; - const uint8_t * q4 = (const uint8_t *)q16; -#endif - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - -#if K_QUANTS_PER_ITERATION == 2 - const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); - const uint32_t * q2 = q1 + 16; - - q32[0] = q1[0] & 0x0f0f0f0f; - q32[1] = q1[0] & 0xf0f0f0f0; - q32[2] = q2[0] & 0x0f0f0f0f; - q32[3] = q2[0] & 0xf0f0f0f0; - - sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 4; ++l) { - s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; - s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + - s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - - dmin * smin; -#else - const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); - const uint16_t * q2 = q1 + 32; - - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[0] & 0xf0f0; - q16[2] = q2[0] & 0x0f0f; - q16[3] = q2[0] & 0xf0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 2; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; - s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; -#endif - - } -#else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); - - const int step = tid * K_QUANTS_PER_ITERATION; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - float tmp = 0; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const float * y = yy + i*QK_K + step; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) - + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) - + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) - + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); - } - tmp += sum; - } - -#endif - - // sum up partial sums and write back result -#pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (tid == 0) { - dst[row] = tmp; - } -} - -/* -DPCT1110:7: The total declared local variable size in device function -dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ -static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, - const float *__restrict__ yy, - float *__restrict__ dst, - const int ncols, - const sycl::nd_item<3> &item_ct1) { - - const int row = item_ct1.get_group(2); - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q5_K * x = (const block_q5_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - -#if QK_K == 256 - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int tid = item_ct1.get_local_id(2) / 2; // 0...15 - const int ix = item_ct1.get_local_id(2) % 2; - - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 2; - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - const uint8_t hm1 = 1 << (2*im); - const uint8_t hm2 = hm1 << 4; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - uint16_t q16[8]; - const uint8_t * q4 = (const uint8_t *)q16; - - for (int i = ix; i < num_blocks_per_row; i += 2) { - - const uint8_t * ql1 = x[i].qs + q_offset; - const uint8_t * qh = x[i].qh + l0; - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - const uint16_t * q1 = (const uint16_t *)ql1; - const uint16_t * q2 = q1 + 32; - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[8] & 0x0f0f; - q16[2] = (q1[0] >> 4) & 0x0f0f; - q16[3] = (q1[8] >> 4) & 0x0f0f; - q16[4] = q2[0] & 0x0f0f; - q16[5] = q2[8] & 0x0f0f; - q16[6] = (q2[0] >> 4) & 0x0f0f; - q16[7] = (q2[8] >> 4) & 0x0f0f; - for (int l = 0; l < n; ++l) { - sum.x() += - y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + - y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); - sum.y() += - y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + - y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); - sum.z() += - y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + - y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); - sum.w() += - y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + - y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); - smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] - + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; - } - tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + - sum.w() * sc[5]) - - dmin * smin; - } - -#else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); - const int step = tid * K_QUANTS_PER_ITERATION; - const int im = step/8; - const int in = step%8; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const int8_t * s = x[i].scales; - const float * y = yy + i*QK_K + step; - const float d = x[i].d; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - const uint8_t h = x[i].qh[in+j] >> im; - sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) - + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) - + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) - + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); - } - tmp += sum; - } -#endif - - // sum up partial sums and write back result -#pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[row] = tmp; - } -} - -static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows, - const sycl::nd_item<3> &item_ct1) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - - const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + - item_ct1.get_local_id(1); - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q6_K * x = (const block_q6_K *)vx + ib0; - -#if QK_K == 256 - - const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = - item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 - - const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - -#if K_QUANTS_PER_ITERATION == 1 - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 - const int is = 0; -#else - const int l0 = 4 * in; // 0, 4, 8, ..., 28 - const int is = in / 4; -#endif - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * ql = x[i].ql + ql_offset; - const uint8_t * qh = x[i].qh + qh_offset; - const int8_t * s = x[i].scales + s_offset; - - const float d = x[i].d; - -#if K_QUANTS_PER_ITERATION == 1 - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp += sum; -#else - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp += sum; -#endif - - } - -#else - - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3 - - const int step = tid * K_QUANTS_PER_ITERATION; - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + step; - const uint8_t * ql = x[i].ql + step; - const uint8_t * qh = x[i].qh + step; - const int8_t * s = x[i].scales; - - const float d = x[i+0].d; - - float sum = 0; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) - + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) - + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) - + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); - } - tmp += sum; - - } - -#endif - - // sum up partial sums and write back result -#pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (tid == 0) { - dst[row] = tmp; - } -} - -static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec_reorder( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - - -static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); - const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - dequantize_mul_mat_vec( - vx, y, dst, ncols, nrows, item_ct1); - }); - } -} - -static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 - const int block_num_y = (nrows + ny - 1) / ny; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); - }); -} - -static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); - }); -} - -static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); - }); -} - -static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); - }); -} - -static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, - float *dst, const int ncols, - const int nrows, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { - dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); - }); -} - -void ggml_sycl_op_dequantize_mul_mat_vec( - ggml_backend_sycl_context & ctx, - const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, - float *dst_dd_i, const int64_t row_low, const int64_t row_high, - const int64_t src1_ncols, const int64_t src1_padded_row_size, - const dpct::queue_ptr &stream) { - - const int64_t ne00 = src0->ne[0]; - const int64_t row_diff = row_high - row_low; - GGML_ASSERT(src1->type == GGML_TYPE_F32); - // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics -#ifdef GGML_SYCL_F16 - ggml_sycl_pool_alloc src1_dfloat_a(ctx.pool()); - sycl::half *src1_dfloat = nullptr; // dfloat == half - - bool src1_convert_f16 = - src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || - src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; - - if (src1_convert_f16) { - scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2, - " : converting src1 to fp16"); - src1_dfloat = src1_dfloat_a.alloc(ne00); - const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst); - GGML_ASSERT(to_fp16_sycl != nullptr); - to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream); - } -#else - const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion -#endif // GGML_SYCL_F16 - - switch (src0->type) { - case GGML_TYPE_Q4_0: - if ((ggml_tensor_extra_gpu*)dst->src[0]->extra && - ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { - dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - } else { - dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - } - break; - case GGML_TYPE_Q4_1: - dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_0: - dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_1: - dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q8_0: - dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_K: - if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && - ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { - // reorder is currently not supported for dmmv - GGML_ABORT("Unimplemented dequantize case case for q4_k reorder"); - } else { - dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - } - break; - case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_F16: - convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - default: - printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); - GGML_ABORT("fatal error"); - } - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddq_i); - GGML_UNUSED(src1_ncols); - GGML_UNUSED(src1_padded_row_size); - GGML_UNUSED(ctx); -} +#include "convert.hpp" +#include "dmmv.hpp" +#include "dequantize.hpp" +#include "presets.hpp" + +static void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const sycl::half *x = (const sycl::half *)vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +static void convert_f32(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const float * x = (const float *) vx; + + // automatic half -> float type cast if dfloat == float + v.x() = x[ib + iqs + 0]; + v.y() = x[ib + iqs + 1]; +} + +template +static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int tid = item_ct1.get_local_id(2); + + const int iter_stride = 2*GGML_SYCL_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 1 : qk/2; + +// partial sum for each thread +#ifdef GGML_SYCL_F16 + sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_SYCL_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_SYCL_F16 + dfloat2 t1{y[iybs + iqs + j / qr + 0], + y[iybs + iqs + j / qr + y_offset]}; + + tmp += v * t1; +#else + tmp += v.x() * y[iybs + iqs + j / qr + 0]; + tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; +#endif // GGML_SYCL_F16 + } + } + + // sum up partial sums and write back result + const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2; + for (int mask = mask_start; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { +#ifdef GGML_SYCL_F16 + dst[row] = tmp.x() + tmp.y(); +#else + dst[row] = tmp; +#endif // GGML_SYCL_F16 + } +} + +template +static void dequantize_mul_mat_vec_reorder(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int tid = item_ct1.get_local_id(2); + + + const int ncols_left = ncols % (QK4_0*WARP_SIZE); + const int ncols_align = ncols - ncols_left; + const int iter_stride = 8*2*GGML_SYCL_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter //64/16=4, 512/16/2= 16 + const int y_offset = qr == 1 ? 1 : qk/2; + +// partial sum for each thread +#ifdef GGML_SYCL_F16 + sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_SYCL_F16 + const char *d_ptr = (const char*)vx+ncols*nrows/2; + int i=0; + for (i = 0; i < ncols_align; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_SYCL_F16 + dfloat2 t1{y[iybs + iqs + j / qr + 0], + y[iybs + iqs + j / qr + y_offset]}; + + tmp += v * t1; +#else + tmp += v.x() * y[iybs + iqs + j / qr + 0]; + tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; +#endif // GGML_SYCL_F16 + } + } + + for (; i < ncols; i += iter_stride) { + if (tid>=ncols_left/QK4_0) continue; + const int col = i + vals_per_iter*tid; + const int ib = (row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel_reorder((const void *)d_ptr, ib, (const void *)vx, ib * QK4_0 / 2 +iqs+j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_SYCL_F16 + dfloat2 t1{y[iybs + iqs + j / qr + 0], + y[iybs + iqs + j / qr + y_offset]}; + + tmp += v * t1; +#else + tmp += v.x() * y[iybs + iqs + j / qr + 0]; + tmp += v.y() * y[iybs + iqs + j / qr + y_offset]; +#endif // GGML_SYCL_F16 + } + } + + // sum up partial sums and write back result + const int mask_start = ncols > GGML_SYCL_DMMV_X ? WARP_SIZE >> 1 : WARP_SIZE >> 2; + for (int mask = mask_start; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { +#ifdef GGML_SYCL_F16 + dst[row] = tmp.x() + tmp.y(); +#else + dst[row] = tmp; +#endif // GGML_SYCL_F16 + } +} + +static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, + nrows, item_ct1); + }); + } +} + +/* +DPCT1110:4: The total declared local variable size in device function +dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = item_ct1.get_local_id(2) / + (2 * K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = item_ct1.get_local_id(2) % + (2 * K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const sycl::float2 dall = + x[i].dm.convert(); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x() * sum1 - dall.y() * sum2; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:5: The total declared local variable size in device function +dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:6: The total declared local variable size in device function +dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; + s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - + dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +/* +DPCT1110:7: The total declared local variable size in device function +dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register +pressure. Consult with your hardware vendor to find the total register size +available and adjust the code, or use smaller sub-group size to avoid high +register pressure. +*/ +static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = item_ct1.get_local_id(2) / 2; // 0...15 + const int ix = item_ct1.get_local_id(2) % 2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x() += + y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + + y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); + sum.y() += + y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + + y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); + sum.z() += + y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + + y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); + sum.w() += + y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + + y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + + sum.w() * sc[5]) - + dmin * smin; + } + +#else + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = + item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (tid == 0) { + dst[row] = tmp; + } +} + +static void dequantize_mul_mat_vec_q4_0_sycl_reorder(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec_reorder( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + + +static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); + }); +} + +static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); + }); +} + +void ggml_sycl_op_dequantize_mul_mat_vec( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + GGML_ASSERT(src1->type == GGML_TYPE_F32); + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_SYCL_F16 + ggml_sycl_pool_alloc src1_dfloat_a(ctx.pool()); + sycl::half *src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2, + " : converting src1 to fp16"); + src1_dfloat = src1_dfloat_a.alloc(ne00); + const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst); + GGML_ASSERT(to_fp16_sycl != nullptr); + to_fp16_sycl(src1_ddf_i, src1_dfloat, ne00, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_SYCL_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + if ((ggml_tensor_extra_gpu*)dst->src[0]->extra && + ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { + dequantize_mul_mat_vec_q4_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } else { + dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && + ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + // reorder is currently not supported for dmmv + GGML_ABORT("Unimplemented dequantize case case for q4_k reorder"); + } else { + dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + } + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); + GGML_ABORT("fatal error"); + } + + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddq_i); + GGML_UNUSED(src1_ncols); + GGML_UNUSED(src1_padded_row_size); + GGML_UNUSED(ctx); +} diff --git a/ggml/src/ggml-sycl/dmmv.hpp b/ggml/src/ggml-sycl/dmmv.hpp index bd837356415..ac538f05ba4 100644 --- a/ggml/src/ggml-sycl/dmmv.hpp +++ b/ggml/src/ggml-sycl/dmmv.hpp @@ -1,27 +1,27 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_DMMV_HPP -#define GGML_SYCL_DMMV_HPP - -#include "common.hpp" - - -void ggml_sycl_op_dequantize_mul_mat_vec( - ggml_backend_sycl_context & ctx, - const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, - float *dst_dd_i, const int64_t row_low, const int64_t row_high, - const int64_t src1_ncols, const int64_t src1_padded_row_size, - const dpct::queue_ptr &stream); - -#endif // GGML_SYCL_DMMV_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_DMMV_HPP +#define GGML_SYCL_DMMV_HPP + +#include "common.hpp" + + +void ggml_sycl_op_dequantize_mul_mat_vec( + ggml_backend_sycl_context & ctx, + const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, + const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, + float *dst_dd_i, const int64_t row_low, const int64_t row_high, + const int64_t src1_ncols, const int64_t src1_padded_row_size, + const dpct::queue_ptr &stream); + +#endif // GGML_SYCL_DMMV_HPP diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index f93cfa701f5..dbe09d74c04 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -1,2977 +1,2977 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_DPCT_HELPER_HPP -#define GGML_SYCL_DPCT_HELPER_HPP - -#include -#include -#include -#include - -#ifdef GGML_SYCL_USE_INTEL_ONEMKL -#include -// Allow to use the same namespace for Intel oneMKL and oneMath -namespace oneapi { - namespace math = mkl; -} -#else -#include -#endif - -#include "ggml.h" - -#if defined(__linux__) -#include -#elif defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#else -#error "Only support Windows and Linux." -#endif - -#if defined(__linux__) -#include -#include -#endif -#if defined(_WIN64) -#ifndef NOMINMAX -#define NOMINMAX -#endif -#include -#endif - -#define DPCT_COMPATIBILITY_TEMP (900) - -#if defined(_MSC_VER) -#define __dpct_align__(n) __declspec(align(n)) -#define __dpct_inline__ __forceinline -#else -#define __dpct_align__(n) __attribute__((aligned(n))) -#define __dpct_inline__ __inline__ __attribute__((always_inline)) -#endif - -#if defined(_MSC_VER) -#define __dpct_noinline__ __declspec(noinline) -#else -#define __dpct_noinline__ __attribute__((noinline)) -#endif - -inline std::string get_device_type_name(const sycl::device &Device) { - auto DeviceType = Device.get_info(); - switch (DeviceType) { - case sycl::info::device_type::cpu: - return "cpu"; - case sycl::info::device_type::gpu: - return "gpu"; - case sycl::info::device_type::host: - return "host"; - case sycl::info::device_type::accelerator: - return "acc"; - default: - return "unknown"; - } -} - -inline std::string get_device_backend_and_type(const sycl::device &device) { - std::stringstream device_type; - sycl::backend backend = device.get_backend(); - device_type << backend << ":" << get_device_type_name(device); - return device_type.str(); -} - -template struct matrix_info_t { - oneapi::math::transpose transpose_info[2]; - Ts value_info[2]; - std::int64_t size_info[3]; - std::int64_t ld_info[3]; - std::int64_t groupsize_info; -}; - -inline auto get_onemath_backend(sycl::queue& queue) -#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL) - -> sycl::queue& -#endif -{ -// If the backend is known at compile-time, use oneMath backend_selector to use -// compile-time dispatching and avoid the need to dlopen libraries. Otherwise -// fallback to runtime dispatching. -#if defined(GGML_SYCL_NVIDIA) - return oneapi::math::backend_selector{ queue }; -#elif defined(GGML_SYCL_AMD) - return oneapi::math::backend_selector{ queue }; -#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL) - return queue; -#else - static_assert(false, "Unsupported backend"); -#endif -} - -namespace dpct -{ - typedef sycl::queue *queue_ptr; - typedef sycl::event *event_ptr; - typedef char *device_ptr; - typedef uint8_t byte_t; - typedef sycl::buffer buffer_t; - - /// SYCL default exception handler - inline auto exception_handler = [](sycl::exception_list exceptions) - { - for (std::exception_ptr const &e : exceptions) - { - try - { - std::rethrow_exception(e); - } - catch (sycl::exception const &e) - { - std::cerr << "Caught asynchronous SYCL exception:" << std::endl - << e.what() << std::endl - << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - } - } - }; - - enum error_code - { - success = 0, - default_error = 999 - }; - - enum memcpy_direction - { - host_to_host, - host_to_device, - device_to_host, - device_to_device, - automatic - }; - - enum memory_region - { - global = 0, // device global memory - constant, // device constant memory - local, // device local memory - shared, // memory which can be accessed by host and device - }; - - enum class library_data_t : unsigned char - { - real_float = 0, - complex_float, - real_double, - complex_double, - real_half, - complex_half, - real_bfloat16, - complex_bfloat16, - real_int4, - complex_int4, - real_uint4, - complex_uint4, - real_int8, - complex_int8, - real_uint8, - complex_uint8, - real_int16, - complex_int16, - real_uint16, - complex_uint16, - real_int32, - complex_int32, - real_uint32, - complex_uint32, - real_int64, - complex_int64, - real_uint64, - complex_uint64, - real_int8_4, - real_int8_32, - real_uint8_4, - library_data_t_size - }; - - template - struct DataType - { - using T2 = T; - }; - template - struct DataType> - { - using T2 = std::complex; - }; - - static void destroy_event(event_ptr event) - { - delete event; - } - - static inline unsigned int get_tid() - { -#if defined(__linux__) - return syscall(SYS_gettid); -#elif defined(_WIN64) - return GetCurrentThreadId(); -#else -#error "Only support Windows and Linux." -#endif - } - - namespace detail - { - static void get_version(const sycl::device &dev, int &major, int &minor) - { - // Version string has the following format: - // a. OpenCL - // b. - // c. e.g gfx1030 - std::string ver; - ver = dev.get_info(); - std::string::size_type i = 0; - while (i < ver.size()) { - if (isdigit(ver[i])) - break; - i++; - } - major = std::stoi(&(ver[i])); - while (i < ver.size()) { - if (ver[i] == '.') - break; - i++; - } - if (i < ver.size()) { - // a. and b. - i++; - minor = std::stoi(&(ver[i])); - } else { - // c. - minor = 0; - } - } - - template - class generic_error_type - { - public: - generic_error_type() = default; - generic_error_type(T value) : value{value} {} - operator T() const { return value; } - - private: - T value; - }; - - } // namespace detail - - // COPY from DPCT head files - /// dim3 is used to store 3 component dimensions. - class dim3 { - public: - unsigned x, y, z; - - constexpr dim3(unsigned x = 1, unsigned y = 1, unsigned z = 1) - : x(x), y(y), z(z) {} - - dim3(const sycl::id<3> &r) : dim3(r[2], r[1], r[0]) {} - - operator sycl::range<3>() const { return sycl::range<3>(z, y, x); } - }; // namespace dim3 - - inline dim3 operator*(const dim3 &a, const dim3 &b) { - return dim3{a.x * b.x, a.y * b.y, a.z * b.z}; - } - // COPY from DPCT head files - - - /// Pitched 2D/3D memory data. - class pitched_data - { - public: - pitched_data() : pitched_data(nullptr, 0, 0, 0) {} - pitched_data(void *data, size_t pitch, size_t x, size_t y) - : _data(data), _pitch(pitch), _x(x), _y(y) {} - - void *get_data_ptr() { return _data; } - void set_data_ptr(void *data) { _data = data; } - - size_t get_pitch() { return _pitch; } - void set_pitch(size_t pitch) { _pitch = pitch; } - - size_t get_x() { return _x; } - void set_x(size_t x) { _x = x; } - - size_t get_y() { return _y; } - void set_y(size_t y) { _y = y; } - - private: - void *_data; - size_t _pitch, _x, _y; - }; - - class device_info - { - public: - // get interface - const char *get_name() const { return _name; } - char *get_name() { return _name; } - template , - std::enable_if_t> || - std::is_same_v, - int> = 0> - auto get_max_work_item_sizes() const - { - if constexpr (std::is_same_v>) - return sycl::range<3>(_max_work_item_sizes_i[0], - _max_work_item_sizes_i[1], - _max_work_item_sizes_i[2]); - else - { - return _max_work_item_sizes_i; - } - } - template , - std::enable_if_t> || - std::is_same_v, - int> = 0> - auto get_max_work_item_sizes() - { - if constexpr (std::is_same_v>) - return sycl::range<3>(_max_work_item_sizes_i[0], - _max_work_item_sizes_i[1], - _max_work_item_sizes_i[2]); - else - { - return _max_work_item_sizes_i; - } - } - bool get_host_unified_memory() const { return _host_unified_memory; } - int get_major_version() const { return _major; } - int get_minor_version() const { return _minor; } - int get_integrated() const { return _integrated; } - int get_max_clock_frequency() const { return _frequency; } - int get_max_compute_units() const { return _max_compute_units; } - int get_max_work_group_size() const { return _max_work_group_size; } - int get_max_sub_group_size() const { return _max_sub_group_size; } - int get_max_work_items_per_compute_unit() const - { - return _max_work_items_per_compute_unit; - } - int get_max_register_size_per_work_group() const - { - return _max_register_size_per_work_group; - } - template || - std::is_same_v, - int> = 0> - auto get_max_nd_range_size() const - { - if constexpr (std::is_same_v) - return _max_nd_range_size; - else - return _max_nd_range_size_i; - } - template || - std::is_same_v, - int> = 0> - auto get_max_nd_range_size() - { - if constexpr (std::is_same_v) - return _max_nd_range_size; - else - return _max_nd_range_size_i; - } - size_t get_global_mem_size() const { return _global_mem_size; } - size_t get_local_mem_size() const { return _local_mem_size; } - size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; } - /// Returns the maximum clock rate of device's global memory in kHz. If - /// compiler does not support this API then returns default value 3200000 kHz. - unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } - /// Returns the maximum bus width between device and memory in bits. If - /// compiler does not support this API then returns default value 64 bits. - unsigned int get_memory_bus_width() const { return _memory_bus_width; } - uint32_t get_device_id() const { return _device_id; } - std::array get_uuid() const { return _uuid; } - /// Returns global memory cache size in bytes. - unsigned int get_global_mem_cache_size() const - { - return _global_mem_cache_size; - } - - // set interface - void set_name(const char *name) - { - size_t length = strlen(name); - if (length < 256) - { - std::memcpy(_name, name, length + 1); - } - else - { - std::memcpy(_name, name, 255); - _name[255] = '\0'; - } - } - void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) - { - for (int i = 0; i < 3; ++i) - _max_work_item_sizes_i[i] = max_work_item_sizes[i]; - } - [[deprecated]] void - set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) - { - for (int i = 0; i < 3; ++i) - { - _max_work_item_sizes_i[i] = max_work_item_sizes[i]; - } - } - void set_host_unified_memory(bool host_unified_memory) - { - _host_unified_memory = host_unified_memory; - } - void set_major_version(int major) { _major = major; } - void set_minor_version(int minor) { _minor = minor; } - void set_integrated(int integrated) { _integrated = integrated; } - void set_max_clock_frequency(int frequency) { _frequency = frequency; } - void set_max_compute_units(int max_compute_units) - { - _max_compute_units = max_compute_units; - } - void set_global_mem_size(size_t global_mem_size) - { - _global_mem_size = global_mem_size; - } - void set_local_mem_size(size_t local_mem_size) - { - _local_mem_size = local_mem_size; - } - void set_max_mem_alloc_size(size_t max_mem_alloc_size) - { - _max_mem_alloc_size = max_mem_alloc_size; - } - void set_max_work_group_size(int max_work_group_size) - { - _max_work_group_size = max_work_group_size; - } - void set_max_sub_group_size(int max_sub_group_size) - { - _max_sub_group_size = max_sub_group_size; - } - void - set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) - { - _max_work_items_per_compute_unit = max_work_items_per_compute_unit; - } - void set_max_nd_range_size(int max_nd_range_size[]) - { - for (int i = 0; i < 3; i++) - { - _max_nd_range_size[i] = max_nd_range_size[i]; - _max_nd_range_size_i[i] = max_nd_range_size[i]; - } - } - void set_memory_clock_rate(unsigned int memory_clock_rate) - { - _memory_clock_rate = memory_clock_rate; - } - void set_memory_bus_width(unsigned int memory_bus_width) - { - _memory_bus_width = memory_bus_width; - } - void - set_max_register_size_per_work_group(int max_register_size_per_work_group) - { - _max_register_size_per_work_group = max_register_size_per_work_group; - } - void set_device_id(uint32_t device_id) - { - _device_id = device_id; - } - void set_uuid(std::array uuid) - { - _uuid = std::move(uuid); - } - void set_global_mem_cache_size(unsigned int global_mem_cache_size) - { - _global_mem_cache_size = global_mem_cache_size; - } - - private: - char _name[256]; - int _max_work_item_sizes_i[3]; - bool _host_unified_memory = false; - int _major; - int _minor; - int _integrated = 0; - int _frequency; - // Set estimated value 3200000 kHz as default value. - unsigned int _memory_clock_rate = 3200000; - // Set estimated value 64 bits as default value. - unsigned int _memory_bus_width = 64; - unsigned int _global_mem_cache_size; - int _max_compute_units; - int _max_work_group_size; - int _max_sub_group_size; - int _max_work_items_per_compute_unit; - int _max_register_size_per_work_group; - size_t _global_mem_size; - size_t _local_mem_size; - size_t _max_mem_alloc_size; - size_t _max_nd_range_size[3]; - int _max_nd_range_size_i[3]; - uint32_t _device_id; - std::array _uuid; - }; - - static int get_major_version(const sycl::device &dev) - { - int major, minor; - detail::get_version(dev, major, minor); - return major; - } - - static int get_minor_version(const sycl::device &dev) - { - int major, minor; - detail::get_version(dev, major, minor); - return minor; - } - - static void get_device_info(device_info &out, const sycl::device &dev) - { - device_info prop; - prop.set_name(dev.get_info().c_str()); - - int major, minor; - detail::get_version(dev, major, minor); - prop.set_major_version(major); - prop.set_minor_version(minor); - - prop.set_max_work_item_sizes( -#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902) - // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes - // is an enum class element - dev.get_info()); -#else - // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by - // an int - dev.get_info>()); -#endif - prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations)); - - prop.set_max_clock_frequency( - dev.get_info() * 1000); - - prop.set_max_compute_units( - dev.get_info()); - prop.set_max_work_group_size( - dev.get_info()); - prop.set_global_mem_size(dev.get_info()); - prop.set_local_mem_size(dev.get_info()); - prop.set_max_mem_alloc_size(dev.get_info()); - -#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) - if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) - { - unsigned int tmp = - dev.get_info(); - if (tmp != 0) - prop.set_memory_clock_rate(1000 * tmp); - } - if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) - { - prop.set_memory_bus_width( - dev.get_info()); - } - if (dev.has(sycl::aspect::ext_intel_device_id)) - { - prop.set_device_id( - dev.get_info()); - } - if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) - { - prop.set_uuid(dev.get_info()); - } -#elif defined(_MSC_VER) && !defined(__clang__) -#pragma message("get_device_info: querying memory_clock_rate and \ - memory_bus_width are not supported by the compiler used. \ - Use 3200000 kHz as memory_clock_rate default value. \ - Use 64 bits as memory_bus_width default value.") -#else -#warning "get_device_info: querying memory_clock_rate and \ - memory_bus_width are not supported by the compiler used. \ - Use 3200000 kHz as memory_clock_rate default value. \ - Use 64 bits as memory_bus_width default value." -#endif - - size_t max_sub_group_size = 1; - std::vector sub_group_sizes = - dev.get_info(); - - for (const auto &sub_group_size : sub_group_sizes) - { - if (max_sub_group_size < sub_group_size) - max_sub_group_size = sub_group_size; - } - - prop.set_max_sub_group_size(max_sub_group_size); - - prop.set_max_work_items_per_compute_unit( - dev.get_info()); - int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; - prop.set_max_nd_range_size(max_nd_range_size); - - // Estimates max register size per work group, feel free to update the value - // according to device properties. - prop.set_max_register_size_per_work_group(65536); - - prop.set_global_mem_cache_size( - dev.get_info()); - out = prop; - } - - /// dpct device extension - class device_ext : public sycl::device { - typedef std::mutex mutex_type; - - public: - device_ext() : sycl::device() {} - ~device_ext() { - std::lock_guard lock(m_mutex); - clear_queues(); - } - device_ext(const sycl::device &base) : sycl::device(base) { - std::lock_guard lock(m_mutex); - init_queues(); - } - - int is_native_atomic_supported() { return 0; } - int get_major_version() const { return dpct::get_major_version(*this); } - - int get_minor_version() const { return dpct::get_minor_version(*this); } - - int get_max_compute_units() const { - return get_device_info().get_max_compute_units(); - } - - /// Return the maximum clock frequency of this device in KHz. - int get_max_clock_frequency() const { - return get_device_info().get_max_clock_frequency(); - } - - int get_integrated() const { return get_device_info().get_integrated(); } - - int get_max_sub_group_size() const { - return get_device_info().get_max_sub_group_size(); - } - - int get_max_register_size_per_work_group() const { - return get_device_info().get_max_register_size_per_work_group(); - } - - int get_max_work_group_size() const { - return get_device_info().get_max_work_group_size(); - } - - int get_mem_base_addr_align() const { - return get_info(); - } - - size_t get_global_mem_size() const { - return get_device_info().get_global_mem_size(); - } - - size_t get_max_mem_alloc_size() const { - return get_device_info().get_max_mem_alloc_size(); - } - - /// Get the number of bytes of free and total memory on the SYCL device. - /// \param [out] free_memory The number of bytes of free memory on the - /// SYCL device. \param [out] total_memory The number of bytes of total - /// memory on the SYCL device. - void get_memory_info(size_t &free_memory, size_t &total_memory) { - total_memory = get_device_info().get_global_mem_size(); - const char *warning_info = - "get_memory_info: [warning] ext_intel_free_memory is not " - "supported (export/set ZES_ENABLE_SYSMAN=1 to support), " - "use total memory as free memory"; -#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) - if (!has(sycl::aspect::ext_intel_free_memory)) { - std::cerr << warning_info << std::endl; - free_memory = total_memory; - } else { - free_memory = get_info(); - } -#else - std::cerr << warning_info << std::endl; - free_memory = total_memory; -#if defined(_MSC_VER) && !defined(__clang__) -#pragma message("Querying the number of bytes of free memory is not supported") -#else -#warning "Querying the number of bytes of free memory is not supported" -#endif -#endif - } - - void get_device_info(device_info &out) const { - dpct::get_device_info(out, *this); - } - - device_info get_device_info() const { - device_info prop; - dpct::get_device_info(prop, *this); - return prop; - } - - void reset() { - std::lock_guard lock(m_mutex); - clear_queues(); - init_queues(); - } - - sycl::queue &in_order_queue() { return _q_in_order; } - - sycl::queue &out_of_order_queue() { return _q_out_of_order; } - - sycl::queue &default_queue() { return in_order_queue(); } - - void queues_wait_and_throw() { - std::unique_lock lock(m_mutex); - lock.unlock(); - for (auto &q : _queues) { - q.wait_and_throw(); - } - // Guard the destruct of current_queues to make sure the ref count is - // safe. - lock.lock(); - } - - sycl::queue create_queue(bool enable_exception_handler = false) { - return create_in_order_queue(enable_exception_handler); - } - - sycl::queue create_queue(sycl::device device, - bool enable_exception_handler = false) { - return create_in_order_queue(device, enable_exception_handler); - } - - sycl::queue create_in_order_queue(bool enable_exception_handler = false) { - std::lock_guard lock(m_mutex); - return create_queue_impl(enable_exception_handler, - sycl::property::queue::in_order()); - } - - sycl::queue create_in_order_queue(sycl::device device, - bool enable_exception_handler = false) { - std::lock_guard lock(m_mutex); - return create_queue_impl(device, enable_exception_handler, - sycl::property::queue::in_order()); - } - - sycl::queue create_out_of_order_queue( - bool enable_exception_handler = false) { - std::lock_guard lock(m_mutex); - return create_queue_impl(enable_exception_handler); - } - - void destroy_queue(sycl::queue queue) { - std::lock_guard lock(m_mutex); - _queues.erase(std::remove_if(_queues.begin(), _queues.end(), - [=](const sycl::queue &q) -> bool - { - return q == queue; - }), - _queues.end()); - } - void set_saved_queue(sycl::queue q) { - std::lock_guard lock(m_mutex); - _saved_queue = q; - } - sycl::queue get_saved_queue() const { - std::lock_guard lock(m_mutex); - return _saved_queue; - } - - private: - void clear_queues() { _queues.clear(); } - - void init_queues() { - _q_in_order = - create_queue_impl(true, sycl::property::queue::in_order()); - _q_out_of_order = create_queue_impl(true); - _saved_queue = default_queue(); - } - - /// Caller should acquire resource \p m_mutex before calling this - /// function. - template - sycl::queue create_queue_impl(bool enable_exception_handler, - Properties... properties) { - sycl::async_handler eh = {}; - if (enable_exception_handler) { - eh = exception_handler; - } - _queues.push_back(sycl::queue( - *this, eh, - sycl::property_list( -#ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), -#endif - properties...))); - - return _queues.back(); - } - - template - sycl::queue create_queue_impl(sycl::device device, - bool enable_exception_handler, - Properties... properties) { - sycl::async_handler eh = {}; - if (enable_exception_handler) { - eh = exception_handler; - } - _queues.push_back(sycl::queue( - device, eh, - sycl::property_list( -#ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), -#endif - properties...))); - - return _queues.back(); - } - - void get_version(int &major, int &minor) const { - detail::get_version(*this, major, minor); - } - sycl::queue _q_in_order, _q_out_of_order; - sycl::queue _saved_queue; - std::vector _queues; - mutable mutex_type m_mutex; - }; - - - /// device manager - class dev_mgr - { - public: - device_ext ¤t_device() - { - unsigned int dev_id = current_device_id(); - check_id(dev_id); - return *_devs[dev_id]; - } - device_ext &cpu_device() const - { - std::lock_guard lock(m_mutex); - if (_cpu_device == -1) - { - throw std::runtime_error("no valid cpu device"); - } - else - { - return *_devs[_cpu_device]; - } - } - device_ext &get_device(unsigned int id) const - { - std::lock_guard lock(m_mutex); - check_id(id); - return *_devs[id]; - } - unsigned int current_device_id() const - { - std::lock_guard lock(m_mutex); - auto it = _thread2dev_map.find(get_tid()); - if (it != _thread2dev_map.end()) - return it->second; - return DEFAULT_DEVICE_ID; - } - - /// Select device with a device ID. - /// \param [in] id The id of the device which can - /// be obtained through get_device_id(const sycl::device). - void select_device(unsigned int id) - { - std::lock_guard lock(m_mutex); - check_id(id); - _thread2dev_map[get_tid()] = id; - } - unsigned int device_count() { return _devs.size(); } - - unsigned int get_device_id(const sycl::device &dev) - { - unsigned int id = 0; - for (auto &dev_item : _devs) - { - if (*dev_item == dev) - { - return id; - } - id++; - } - return -1; - } - - inline std::string get_preferred_gpu_platform_name() { - std::string result; - - std::string filter = ""; - char* env = getenv("ONEAPI_DEVICE_SELECTOR"); - if (env) { - if (std::strstr(env, "level_zero")) { - filter = "level-zero"; - } - else if (std::strstr(env, "opencl")) { - filter = "opencl"; - } - else if (std::strstr(env, "cuda")) { - filter = "cuda"; - } - else if (std::strstr(env, "hip")) { - filter = "hip"; - } - else { - throw std::runtime_error("invalid device filter: " + std::string(env)); - } - } else { - auto default_device = sycl::device(sycl::default_selector_v); - auto default_platform_name = default_device.get_platform().get_info(); - - if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) { - filter = "level-zero"; - } - else if (std::strstr(default_platform_name.c_str(), "CUDA")) { - filter = "cuda"; - } - else if (std::strstr(default_platform_name.c_str(), "HIP")) { - filter = "hip"; - } - } - - auto platform_list = sycl::platform::get_platforms(); - - for (const auto& platform : platform_list) { - auto devices = platform.get_devices(); - auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) { - return d.is_gpu(); - }); - - if (gpu_dev == devices.end()) { - // cout << "platform [" << platform_name - // << "] does not contain GPU devices, skipping\n"; - continue; - } - - auto platform_name = platform.get_info(); - std::string platform_name_low_case; - platform_name_low_case.resize(platform_name.size()); - - std::transform( - platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower); - - if (platform_name_low_case.find(filter) == std::string::npos) { - // cout << "platform [" << platform_name - // << "] does not match with requested " - // << filter << ", skipping\n"; - continue; - } - - result = platform_name; - } - - if (result.empty()) - throw std::runtime_error("can not find preferred GPU platform"); - - return result; - } - - template - std::enable_if_t< - std::is_invocable_r_v> - select_device(const DeviceSelector &selector = sycl::gpu_selector_v) - { - sycl::device selected_device = sycl::device(selector); - unsigned int selected_device_id = get_device_id(selected_device); - select_device(selected_device_id); - } - - /// Returns the instance of device manager singleton. - static dev_mgr &instance() - { - static dev_mgr d_m; - return d_m; - } - dev_mgr(const dev_mgr &) = delete; - dev_mgr &operator=(const dev_mgr &) = delete; - dev_mgr(dev_mgr &&) = delete; - dev_mgr &operator=(dev_mgr &&) = delete; - - private: - mutable std::recursive_mutex m_mutex; - static bool compare_dev(sycl::device &device1, sycl::device &device2) - { - sycl::backend backend1 = device1.get_backend(); - sycl::backend backend2 = device2.get_backend(); - // levelzero backends always come first - if(backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true; - if(backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false; - dpct::device_info prop1; - dpct::get_device_info(prop1, device1); - dpct::device_info prop2; - dpct::get_device_info(prop2, device2); - return prop1.get_max_compute_units() > prop2.get_max_compute_units(); - } - static int convert_backend_index(std::string & backend) { - if (backend == "ext_oneapi_level_zero:gpu") return 0; - if (backend == "opencl:gpu") return 1; - if (backend == "ext_oneapi_cuda:gpu") return 2; - if (backend == "ext_oneapi_hip:gpu") return 3; - if (backend == "opencl:cpu") return 4; - if (backend == "opencl:acc") return 5; - printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); - GGML_ABORT("fatal error"); - } - static bool compare_backend(std::string &backend1, std::string &backend2) { - return convert_backend_index(backend1) < convert_backend_index(backend2); - } - dev_mgr() - { - sycl::device default_device = - sycl::device(sycl::default_selector_v); - _devs.push_back(std::make_shared(default_device)); - - std::vector sycl_all_devs; - // Collect other devices except for the default device. - if (default_device.is_cpu()) - _cpu_device = 0; - - auto Platforms = sycl::platform::get_platforms(); - // Keep track of the number of devices per backend - std::map DeviceNums; - std::map> backend_devices; - auto preferred_platform_name = get_preferred_gpu_platform_name(); - - while (!Platforms.empty()) { - auto Platform = Platforms.back(); - Platforms.pop_back(); - auto platform_name = Platform.get_info(); - if (platform_name.compare(preferred_platform_name) != 0) { - continue; - } - auto devices = Platform.get_devices(); - std::string backend_type = get_device_backend_and_type(devices[0]); - for (const auto &device : devices) { - backend_devices[backend_type].push_back(device); - } - } - - std::vector keys; - for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) { - keys.push_back(it->first); - } - std::sort(keys.begin(), keys.end(), compare_backend); - - for (auto &key : keys) { - std::vector devs = backend_devices[key]; - std::sort(devs.begin(), devs.end(), compare_dev); - for (const auto &dev : devs) { - sycl_all_devs.push_back(dev); - } - } - - for (auto &dev : sycl_all_devs) - { - if (dev == default_device) - { - continue; - } - _devs.push_back(std::make_shared(dev)); - if (_cpu_device == -1 && dev.is_cpu()) - { - _cpu_device = _devs.size() - 1; - } - } - } - void check_id(unsigned int id) const - { - if (id >= _devs.size()) - { - throw std::runtime_error("invalid device id"); - } - } - std::vector> _devs; - /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current - /// thread id in _thread2dev_map, which means default device should be used - /// for the current thread. - const unsigned int DEFAULT_DEVICE_ID = 0; - /// thread-id to device-id map. - std::map _thread2dev_map; - int _cpu_device = -1; - }; - - static inline sycl::queue &get_default_queue() - { - return dev_mgr::instance().current_device().default_queue(); - } - - namespace detail - { - enum class pointer_access_attribute - { - host_only = 0, - device_only, - host_device, - end - }; - - static pointer_access_attribute get_pointer_attribute(sycl::queue &q, - const void *ptr) - { - switch (sycl::get_pointer_type(ptr, q.get_context())) - { - case sycl::usm::alloc::unknown: - return pointer_access_attribute::host_only; - case sycl::usm::alloc::device: - return pointer_access_attribute::device_only; - case sycl::usm::alloc::shared: - case sycl::usm::alloc::host: - return pointer_access_attribute::host_device; - } - } - - template - inline constexpr std::uint64_t get_type_combination_id(ArgT Val) - { - static_assert((unsigned char)library_data_t::library_data_t_size <= - std::numeric_limits::max() && - "library_data_t size exceeds limit."); - static_assert(std::is_same_v, "Unsupported ArgT"); - return (std::uint64_t)Val; - } - - template - inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal, - RestT... RestVal) - { - static_assert((std::uint8_t)library_data_t::library_data_t_size <= - std::numeric_limits::max() && - "library_data_t size exceeds limit."); - static_assert(sizeof...(RestT) <= 8 && "Too many parameters"); - static_assert(std::is_same_v, "Unsupported FirstT"); - return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal); - } - - class mem_mgr - { - mem_mgr() - { - // Reserved address space, no real memory allocation happens here. -#if defined(__linux__) - mapped_address_space = - (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -#elif defined(_WIN64) - mapped_address_space = (byte_t *)VirtualAlloc( - NULL, // NULL specified as the base address parameter - mapped_region_size, // Size of allocation - MEM_RESERVE, // Allocate reserved pages - PAGE_NOACCESS); // Protection = no access -#else -#error "Only support Windows and Linux." -#endif - next_free = mapped_address_space; - } - - public: - using buffer_id_t = int; - - struct allocation - { - buffer_t buffer; - byte_t *alloc_ptr; - size_t size; - }; - - ~mem_mgr() - { -#if defined(__linux__) - munmap(mapped_address_space, mapped_region_size); -#elif defined(_WIN64) - VirtualFree(mapped_address_space, 0, MEM_RELEASE); -#else -#error "Only support Windows and Linux." -#endif - } - - mem_mgr(const mem_mgr &) = delete; - mem_mgr &operator=(const mem_mgr &) = delete; - mem_mgr(mem_mgr &&) = delete; - mem_mgr &operator=(mem_mgr &&) = delete; - - /// Allocate - void *mem_alloc(size_t size) - { - if (!size) - return nullptr; - std::lock_guard lock(m_mutex); - if (next_free + size > mapped_address_space + mapped_region_size) - { - throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool"); - } - // Allocation - sycl::range<1> r(size); - buffer_t buf(r); - allocation A{buf, next_free, size}; - // Map allocation to device pointer - void *result = next_free; - m_map.emplace(next_free + size, A); - // Update pointer to the next free space. - next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1); - - return result; - } - - /// Deallocate - void mem_free(const void *ptr) - { - if (!ptr) - return; - std::lock_guard lock(m_mutex); - auto it = get_map_iterator(ptr); - m_map.erase(it); - } - - /// map: device pointer -> allocation(buffer, alloc_ptr, size) - allocation translate_ptr(const void *ptr) - { - std::lock_guard lock(m_mutex); - auto it = get_map_iterator(ptr); - return it->second; - } - - /// Check if the pointer represents device pointer or not. - bool is_device_ptr(const void *ptr) const - { - std::lock_guard lock(m_mutex); - return (mapped_address_space <= ptr) && - (ptr < mapped_address_space + mapped_region_size); - } - - /// Returns the instance of memory manager singleton. - static mem_mgr &instance() - { - static mem_mgr m; - return m; - } - - private: - std::map m_map; - mutable std::mutex m_mutex; - byte_t *mapped_address_space; - byte_t *next_free; - const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024; - const size_t alignment = 256; - /// This padding may be defined to some positive value to debug - /// out of bound accesses. - const size_t extra_padding = 0; - - std::map::iterator get_map_iterator(const void *ptr) - { - auto it = m_map.upper_bound(const_cast(reinterpret_cast(ptr))); - if (it == m_map.end()) - { - // Not a virtual pointer. - throw std::runtime_error("can not get buffer from non-virtual pointer"); - } - const allocation &alloc = it->second; - if (ptr < alloc.alloc_ptr) - { - // Out of bound. - // This may happen if there's a gap between allocations due to alignment - // or extra padding and pointer points to this gap. - throw std::runtime_error("invalid virtual pointer"); - } - return it; - } - }; - - template - class accessor; - template - class memory_traits - { - public: - static constexpr sycl::access::target target = - sycl::access::target::device; - static constexpr sycl::access_mode mode = - (Memory == constant) ? sycl::access_mode::read - : sycl::access_mode::read_write; - static constexpr size_t type_size = sizeof(T); - using element_t = - typename std::conditional::type; - using value_t = typename std::remove_cv::type; - template - using accessor_t = typename std::conditional< - Memory == local, sycl::local_accessor, - sycl::accessor>::type; - using pointer_t = T *; - }; - - static inline void *dpct_malloc(size_t size, sycl::queue &q) - { - return sycl::malloc_device(size, q.get_device(), q.get_context()); - } - -#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F)) - static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z, - sycl::queue &q) - { - pitch = PITCH_DEFAULT_ALIGN(x); - return dpct_malloc(pitch * y * z, q); - } - - /** - * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q. - * @tparam valueT The type of the element to be set. - * @param [in] q The queue in which the operation is done. - * @param [in] dev_ptr Pointer to the virtual device memory address. - * @param [in] value The value to be set. - * @param [in] size Number of elements to be set to the value. - * @return An event representing the memset operation. - */ - template - static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr, - valueT value, size_t size) - { - return q.fill(dev_ptr, value, size); - } - - /** - * @brief Sets \p value to the 3D memory region pointed by \p data in \p q. - * @tparam valueT The type of the element to be set. - * @param [in] q The queue in which the operation is done. - * @param [in] data Pointer to the pitched device memory region. - * @param [in] value The value to be set. - * @param [in] size 3D memory region by number of elements. - * @return An event list representing the memset operations. - */ - template - static inline std::vector - dpct_memset(sycl::queue &q, pitched_data data, valueT value, - sycl::range<3> size) - { - std::vector event_list; - size_t slice = data.get_pitch() * data.get_y(); - unsigned char *data_surface = (unsigned char *)data.get_data_ptr(); - for (size_t z = 0; z < size.get(2); ++z) - { - unsigned char *data_ptr = data_surface; - for (size_t y = 0; y < size.get(1); ++y) - { - event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0))); - data_ptr += data.get_pitch(); - } - data_surface += slice; - } - return event_list; - } - - /** - * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q. - * @tparam valueT The type of the element to be set. - * @param [in] q The queue in which the operation is done. - * @param [in] ptr Pointer to the virtual device memory. - * @param [in] pitch The pitch size by number of elements, including padding. - * @param [in] val The value to be set. - * @param [in] x The width of memory region by number of elements. - * @param [in] y The height of memory region by number of elements. - * @return An event list representing the memset operations. - */ - template - static inline std::vector - dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x, - size_t y) - { - return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val, - sycl::range<3>(x, y, 1)); - } - - static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr, - const void *from_ptr, - memcpy_direction dir) - { - switch (dir) - { - case memcpy_direction::host_to_host: - case memcpy_direction::host_to_device: - case memcpy_direction::device_to_host: - case memcpy_direction::device_to_device: - return dir; - case memcpy_direction::automatic: - { - // table[to_attribute][from_attribute] - static const memcpy_direction - direction_table[static_cast(pointer_access_attribute::end)] - [static_cast(pointer_access_attribute::end)] = - {{memcpy_direction::host_to_host, - memcpy_direction::device_to_host, - memcpy_direction::host_to_host}, - {memcpy_direction::host_to_device, - memcpy_direction::device_to_device, - memcpy_direction::device_to_device}, - {memcpy_direction::host_to_host, - memcpy_direction::device_to_device, - memcpy_direction::device_to_device}}; - return direction_table[static_cast(get_pointer_attribute( - q, to_ptr))][static_cast(get_pointer_attribute(q, from_ptr))]; - } - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } - } - - static sycl::event - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, - memcpy_direction direction, - const std::vector &dep_events = {}) - { - if (!size) - return sycl::event{}; - return q.memcpy(to_ptr, from_ptr, size, dep_events); - GGML_UNUSED(direction); - } - - // Get actual copy range and make sure it will not exceed range. - static inline size_t get_copy_range(sycl::range<3> size, size_t slice, - size_t pitch) - { - return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); - } - - static inline size_t get_offset(sycl::id<3> id, size_t slice, - size_t pitch) - { - return slice * id.get(2) + pitch * id.get(1) + id.get(0); - } - - /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr - /// and \p from_range to another specified by \p to_ptr and \p to_range. - static inline std::vector - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, - sycl::range<3> to_range, sycl::range<3> from_range, - sycl::id<3> to_id, sycl::id<3> from_id, - sycl::range<3> size, memcpy_direction direction, - const std::vector &dep_events = {}) - { - // RAII for host pointer - class host_buffer - { - void *_buf; - size_t _size; - sycl::queue &_q; - const std::vector &_deps; // free operation depends - - public: - host_buffer(size_t size, sycl::queue &q, - const std::vector &deps) - : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} - void *get_ptr() const { return _buf; } - size_t get_size() const { return _size; } - ~host_buffer() - { - if (_buf) - { - _q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(_deps); - cgh.host_task([buf = _buf] { std::free(buf); }); }); - } - } - }; - std::vector event_list; - - size_t to_slice = to_range.get(1) * to_range.get(0), - from_slice = from_range.get(1) * from_range.get(0); - unsigned char *to_surface = - (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); - const unsigned char *from_surface = - (const unsigned char *)from_ptr + - get_offset(from_id, from_slice, from_range.get(0)); - - if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) - { - return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), - direction, dep_events)}; - } - direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - size_t size_slice = size.get(1) * size.get(0); - switch (direction) - { - case host_to_host: - for (size_t z = 0; z < size.get(2); ++z) - { - unsigned char *to_ptr = to_surface; - const unsigned char *from_ptr = from_surface; - if (to_range.get(0) == from_range.get(0) && - to_range.get(0) == size.get(0)) - { - event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, - direction, dep_events)); - } - else - { - for (size_t y = 0; y < size.get(1); ++y) - { - event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), - direction, dep_events)); - to_ptr += to_range.get(0); - from_ptr += from_range.get(0); - } - } - to_surface += to_slice; - from_surface += from_slice; - } - break; - case host_to_device: - { - host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, - event_list); - std::vector host_events; - if (to_slice == size_slice) - { - // Copy host data to a temp host buffer with the shape of target. - host_events = - dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, - host_to_host, dep_events); - } - else - { - // Copy host data to a temp host buffer with the shape of target. - host_events = dpct_memcpy( - q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, - // If has padding data, not sure whether it is useless. So fill temp - // buffer with it. - std::vector{ - dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), - device_to_host, dep_events)}); - } - // Copy from temp host buffer to device with only one submit. - event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), - buf.get_size(), host_to_device, - host_events)); - break; - } - case device_to_host: - { - host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, - event_list); - // Copy from host temp buffer to host target with reshaping. - event_list = dpct_memcpy( - q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), - sycl::id<3>(0, 0, 0), size, host_to_host, - // Copy from device to temp host buffer with only one submit. - std::vector{dpct_memcpy(q, buf.get_ptr(), from_surface, - buf.get_size(), - device_to_host, dep_events)}); - break; - } - case device_to_device: - event_list.push_back(q.submit([&](sycl::handler &cgh){ - cgh.depends_on(dep_events); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_surface[get_offset(id, to_slice, to_range.get(0))] = - from_surface[get_offset(id, from_slice, from_range.get(0))]; - }); })); - break; - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } - return event_list; - } - - /// memcpy 2D/3D matrix specified by pitched_data. - static inline std::vector - dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, - pitched_data from, sycl::id<3> from_id, sycl::range<3> size, - memcpy_direction direction = automatic) - { - return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), - sycl::range<3>(to.get_pitch(), to.get_y(), 1), - sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, - size, direction); - } - - /// memcpy 2D matrix with pitch. - static inline std::vector - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, - size_t to_pitch, size_t from_pitch, size_t x, size_t y, - memcpy_direction direction = automatic) - { - return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), - sycl::range<3>(from_pitch, y, 1), - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), - sycl::range<3>(x, y, 1), direction); - } - - namespace deprecated - { - - template - class usm_allocator - { - private: - using Alloc = sycl::usm_allocator; - Alloc _impl; - - public: - using value_type = typename std::allocator_traits::value_type; - using pointer = typename std::allocator_traits::pointer; - using const_pointer = typename std::allocator_traits::const_pointer; - using void_pointer = typename std::allocator_traits::void_pointer; - using const_void_pointer = - typename std::allocator_traits::const_void_pointer; - using reference = typename std::allocator_traits::value_type &; - using const_reference = - const typename std::allocator_traits::value_type &; - using difference_type = - typename std::allocator_traits::difference_type; - using size_type = typename std::allocator_traits::size_type; - using propagate_on_container_copy_assignment = typename std::allocator_traits< - Alloc>::propagate_on_container_copy_assignment; - using propagate_on_container_move_assignment = typename std::allocator_traits< - Alloc>::propagate_on_container_move_assignment; - using propagate_on_container_swap = - typename std::allocator_traits::propagate_on_container_swap; - using is_always_equal = - typename std::allocator_traits::is_always_equal; - - template - struct rebind - { - typedef usm_allocator other; - }; - - usm_allocator() : _impl(dpct::get_default_queue()) {} - ~usm_allocator() {} - usm_allocator(const usm_allocator &other) : _impl(other._impl) {} - usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {} - pointer address(reference r) { return &r; } - const_pointer address(const_reference r) { return &r; } - pointer allocate(size_type cnt, const_void_pointer hint = nullptr) - { - return std::allocator_traits::allocate(_impl, cnt, hint); - } - void deallocate(pointer p, size_type cnt) - { - std::allocator_traits::deallocate(_impl, p, cnt); - } - size_type max_size() const - { - return std::allocator_traits::max_size(_impl); - } - bool operator==(const usm_allocator &other) const { return _impl == other._impl; } - bool operator!=(const usm_allocator &other) const { return _impl != other._impl; } - }; - - } // namespace deprecated - - inline void dpct_free(void *ptr, - const sycl::queue &q) - { - if (ptr) - { - sycl::free(ptr, q.get_context()); - } - } - - template - inline auto get_memory(const void *x) - { - T *new_x = reinterpret_cast(const_cast(x)); - return new_x; - } - - template - inline typename DataType::T2 get_value(const T *s, sycl::queue &q) - { - using Ty = typename DataType::T2; - Ty s_h; - if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only) - detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host) - .wait(); - else - s_h = *reinterpret_cast(s); - return s_h; - } - - } // namespace detail - - template - inline auto get_value(const T *s, sycl::queue &q) - { - return detail::get_value(s, q); - } - - namespace detail - { - template - inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, - int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb, - const void * beta, void * c, int ldc) { - Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); - Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); - auto data_a = get_memory(a); - auto data_b = get_memory(b); - auto data_c = get_memory(c); - oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a, - lda, data_b, ldb, beta_value, data_c, ldc); - } - - template - class vectorized_binary - { - public: - inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) - { - VecT v4; - for (size_t i = 0; i < v4.size(); ++i) - { - v4[i] = binary_op(a[i], b[i]); - } - return v4; - } - }; - - template - class vectorized_binary< - VecT, BinaryOperation, - std::void_t>> - { - public: - inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) - { - return binary_op(a, b).template as(); - } - }; - - template - inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, - int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b, - int ldb, const void * beta, void ** c, int ldc, int batch_size, - matrix_info_t * matrix_info) { - Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); - Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); - - matrix_info->transpose_info[0] = a_trans; - matrix_info->transpose_info[1] = b_trans; - matrix_info->value_info[0] = alpha_value; - matrix_info->value_info[1] = beta_value; - matrix_info->size_info[0] = m; - matrix_info->size_info[1] = n; - matrix_info->size_info[2] = k; - matrix_info->ld_info[0] = lda; - matrix_info->ld_info[1] = ldb; - matrix_info->ld_info[2] = ldc; - matrix_info->groupsize_info = batch_size; - - sycl::event e = oneapi::math::blas::column_major::gemm_batch( - get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1, - matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2, - reinterpret_cast(matrix_info->value_info), reinterpret_cast(a), matrix_info->ld_info, - reinterpret_cast(b), matrix_info->ld_info + 1, - reinterpret_cast(matrix_info->value_info + 1), reinterpret_cast(c), - matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); - } - - template - inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, - int m, int n, int k, const void * alpha, const void * a, int lda, - long long int stride_a, const void * b, int ldb, long long int stride_b, - const void * beta, void * c, int ldc, long long int stride_c, int batch_size) { - Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); - Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); - auto data_a = get_memory(a); - auto data_b = get_memory(b); - auto data_c = get_memory(c); - oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, - data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, - data_c, ldc, stride_c, batch_size); - } - - } // namespace detail - - template - inline unsigned vectorized_binary(unsigned a, unsigned b, - const BinaryOperation binary_op) - { - sycl::vec v0{a}, v1{b}; - auto v2 = v0.as(); - auto v3 = v1.as(); - auto v4 = - detail::vectorized_binary()(v2, v3, binary_op); - v0 = v4.template as>(); - return v0; - } - - static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size, - memcpy_direction direction = automatic, - sycl::queue &q = dpct::get_default_queue()) - { - detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction); - } - - static inline unsigned int select_device(unsigned int id) - { - dev_mgr::instance().select_device(id); - return id; - } - - template - T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, - unsigned int logical_sub_group_size = 32) - { - unsigned int id = g.get_local_linear_id(); - unsigned int start_index = - id / logical_sub_group_size * logical_sub_group_size; - unsigned int target_offset = (id % logical_sub_group_size) ^ mask; - return sycl::select_from_group(g, x, - target_offset < logical_sub_group_size - ? start_index + target_offset - : id); - } - - template - inline auto dp4a(T1 a, T2 b, T3 c) - { - return syclcompat::dp4a(a, b, c); - } - - struct sub_sat - { - template - auto operator()(const T x, const T y) const - { - return sycl::sub_sat(x, y); - } - }; - - template - inline T vectorized_min(T a, T b) - { - sycl::vec v0{a}, v1{b}; - auto v2 = v0.template as(); - auto v3 = v1.template as(); - auto v4 = sycl::min(v2, v3); - v0 = v4.template as>(); - return v0; - } - - inline float pow(const float a, const int b) { return sycl::pown(a, b); } - inline double pow(const double a, const int b) { return sycl::pown(a, b); } - inline float pow(const float a, const float b) { return sycl::pow(a, b); } - inline double pow(const double a, const double b) { return sycl::pow(a, b); } - template - inline typename std::enable_if_t, T> - pow(const T a, const U b) - { - return sycl::pow(a, static_cast(b)); - } - template - inline typename std::enable_if_t, double> - pow(const T a, const U b) - { - return sycl::pow(static_cast(a), static_cast(b)); - } - - inline double min(const double a, const float b) - { - return sycl::fmin(a, static_cast(b)); - } - inline double min(const float a, const double b) - { - return sycl::fmin(static_cast(a), b); - } - inline float min(const float a, const float b) { return sycl::fmin(a, b); } - inline double min(const double a, const double b) { return sycl::fmin(a, b); } - inline std::uint32_t min(const std::uint32_t a, const std::int32_t b) - { - return sycl::min(a, static_cast(b)); - } - inline std::uint32_t min(const std::int32_t a, const std::uint32_t b) - { - return sycl::min(static_cast(a), b); - } - inline std::int32_t min(const std::int32_t a, const std::int32_t b) - { - return sycl::min(a, b); - } - inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b) - { - return sycl::min(a, b); - } - inline std::uint64_t min(const std::uint64_t a, const std::int64_t b) - { - return sycl::min(a, static_cast(b)); - } - inline std::uint64_t min(const std::int64_t a, const std::uint64_t b) - { - return sycl::min(static_cast(a), b); - } - inline std::int64_t min(const std::int64_t a, const std::int64_t b) - { - return sycl::min(a, b); - } - inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b) - { - return sycl::min(a, b); - } - inline std::uint64_t min(const std::uint64_t a, const std::int32_t b) - { - return sycl::min(a, static_cast(b)); - } - inline std::uint64_t min(const std::int32_t a, const std::uint64_t b) - { - return sycl::min(static_cast(a), b); - } - inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b) - { - return sycl::min(a, static_cast(b)); - } - inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b) - { - return sycl::min(static_cast(a), b); - } - // max function overloads. - // For floating-point types, `float` or `double` arguments are acceptable. - // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or - // `std::int64_t` type arguments are acceptable. - inline double max(const double a, const float b) - { - return sycl::fmax(a, static_cast(b)); - } - inline double max(const float a, const double b) - { - return sycl::fmax(static_cast(a), b); - } - inline float max(const float a, const float b) { return sycl::fmax(a, b); } - inline double max(const double a, const double b) { return sycl::fmax(a, b); } - inline std::uint32_t max(const std::uint32_t a, const std::int32_t b) - { - return sycl::max(a, static_cast(b)); - } - inline std::uint32_t max(const std::int32_t a, const std::uint32_t b) - { - return sycl::max(static_cast(a), b); - } - inline std::int32_t max(const std::int32_t a, const std::int32_t b) - { - return sycl::max(a, b); - } - inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b) - { - return sycl::max(a, b); - } - inline std::uint64_t max(const std::uint64_t a, const std::int64_t b) - { - return sycl::max(a, static_cast(b)); - } - inline std::uint64_t max(const std::int64_t a, const std::uint64_t b) - { - return sycl::max(static_cast(a), b); - } - inline std::int64_t max(const std::int64_t a, const std::int64_t b) - { - return sycl::max(a, b); - } - inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b) - { - return sycl::max(a, b); - } - inline std::uint64_t max(const std::uint64_t a, const std::int32_t b) - { - return sycl::max(a, static_cast(b)); - } - inline std::uint64_t max(const std::int32_t a, const std::uint64_t b) - { - return sycl::max(static_cast(a), b); - } - inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b) - { - return sycl::max(a, static_cast(b)); - } - inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b) - { - return sycl::max(static_cast(a), b); - } - - inline void - has_capability_or_fail(const sycl::device &dev, - const std::initializer_list &props) - { - for (const auto &it : props) - { - if (dev.has(it)) - continue; - switch (it) - { - case sycl::aspect::fp64: - throw std::runtime_error("'double' is not supported in '" + - dev.get_info() + - "' device"); - break; - case sycl::aspect::fp16: - throw std::runtime_error("'half' is not supported in '" + - dev.get_info() + - "' device"); - break; - default: -#define __SYCL_ASPECT(ASPECT, ID) \ - case sycl::aspect::ASPECT: \ - return #ASPECT; -#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) -#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) - auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string - { - switch (AspectNum) - { -#include -#include - default: - return "unknown aspect"; - } - }; -#undef __SYCL_ASPECT_DEPRECATED_ALIAS -#undef __SYCL_ASPECT_DEPRECATED -#undef __SYCL_ASPECT - throw std::runtime_error( - "'" + getAspectNameStr(it) + "' is not supported in '" + - dev.get_info() + "' device"); - } - break; - } - } - - static inline unsigned int get_current_device_id() - { - return dev_mgr::instance().current_device_id(); - } - - static inline device_ext &get_current_device() - { - return dev_mgr::instance().current_device(); - } - - static inline device_ext &get_device(unsigned int id) - { - return dev_mgr::instance().get_device(id); - } - - static inline sycl::queue &get_in_order_queue() - { - return dev_mgr::instance().current_device().in_order_queue(); - } - - static sycl::event - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, - memcpy_direction direction, - const std::vector &dep_events = {}) - { - if (!size) - return sycl::event{}; - return q.memcpy(to_ptr, from_ptr, size, dep_events); - GGML_UNUSED(direction); - } - - // Get actual copy range and make sure it will not exceed range. - static inline size_t get_copy_range(sycl::range<3> size, size_t slice, - size_t pitch) - { - return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); - } - - static inline size_t get_offset(sycl::id<3> id, size_t slice, - size_t pitch) - { - return slice * id.get(2) + pitch * id.get(1) + id.get(0); - } - - /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr - /// and \p from_range to another specified by \p to_ptr and \p to_range. - static inline std::vector - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, - sycl::range<3> to_range, sycl::range<3> from_range, - sycl::id<3> to_id, sycl::id<3> from_id, - sycl::range<3> size, memcpy_direction direction, - const std::vector &dep_events = {}) - { - // RAII for host pointer - class host_buffer - { - void *_buf; - size_t _size; - sycl::queue &_q; - const std::vector &_deps; // free operation depends - - public: - host_buffer(size_t size, sycl::queue &q, - const std::vector &deps) - : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} - void *get_ptr() const { return _buf; } - size_t get_size() const { return _size; } - ~host_buffer() - { - if (_buf) - { - _q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(_deps); - cgh.host_task([buf = _buf] { std::free(buf); }); }); - } - } - }; - std::vector event_list; - - size_t to_slice = to_range.get(1) * to_range.get(0), - from_slice = from_range.get(1) * from_range.get(0); - unsigned char *to_surface = - (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); - const unsigned char *from_surface = - (const unsigned char *)from_ptr + - get_offset(from_id, from_slice, from_range.get(0)); - - if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) - { - return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), - direction, dep_events)}; - } - direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - size_t size_slice = size.get(1) * size.get(0); - switch (direction) - { - case host_to_host: - for (size_t z = 0; z < size.get(2); ++z) - { - unsigned char *to_ptr = to_surface; - const unsigned char *from_ptr = from_surface; - if (to_range.get(0) == from_range.get(0) && - to_range.get(0) == size.get(0)) - { - event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, - direction, dep_events)); - } - else - { - for (size_t y = 0; y < size.get(1); ++y) - { - event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), - direction, dep_events)); - to_ptr += to_range.get(0); - from_ptr += from_range.get(0); - } - } - to_surface += to_slice; - from_surface += from_slice; - } - break; - case host_to_device: - { - host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, - event_list); - std::vector host_events; - if (to_slice == size_slice) - { - // Copy host data to a temp host buffer with the shape of target. - host_events = - dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, - host_to_host, dep_events); - } - else - { - // Copy host data to a temp host buffer with the shape of target. - host_events = dpct_memcpy( - q, buf.get_ptr(), from_surface, to_range, from_range, - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, - // If has padding data, not sure whether it is useless. So fill temp - // buffer with it. - std::vector{ - dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), - device_to_host, dep_events)}); - } - // Copy from temp host buffer to device with only one submit. - event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), - buf.get_size(), host_to_device, - host_events)); - break; - } - case device_to_host: - { - host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, - event_list); - // Copy from host temp buffer to host target with reshaping. - event_list = dpct_memcpy( - q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), - sycl::id<3>(0, 0, 0), size, host_to_host, - // Copy from device to temp host buffer with only one submit. - std::vector{dpct_memcpy(q, buf.get_ptr(), from_surface, - buf.get_size(), - device_to_host, dep_events)}); - break; - } - case device_to_device: - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_surface[get_offset(id, to_slice, to_range.get(0))] = - from_surface[get_offset(id, from_slice, from_range.get(0))]; - }); })); - break; - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } - return event_list; - } - - /// memcpy 2D/3D matrix specified by pitched_data. - static inline std::vector - dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, - pitched_data from, sycl::id<3> from_id, sycl::range<3> size, - memcpy_direction direction = automatic) - { - return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), - sycl::range<3>(to.get_pitch(), to.get_y(), 1), - sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, - size, direction); - } - - /// memcpy 2D matrix with pitch. - static inline std::vector - dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, - size_t to_pitch, size_t from_pitch, size_t x, size_t y, - memcpy_direction direction = automatic) - { - return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), - sycl::range<3>(from_pitch, y, 1), - sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), - sycl::range<3>(x, y, 1), direction); - } - - inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n, - int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b, - library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc, - library_data_t scaling_type) { - if (scaling_type == library_data_t::real_float && - c_type == library_data_t::complex_float) - { - scaling_type = library_data_t::complex_float; - } - else if (scaling_type == library_data_t::real_double && - c_type == library_data_t::complex_double) - { - scaling_type = library_data_t::complex_double; - } - - std::uint64_t key = - detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); - switch (key) - { - case detail::get_type_combination_id( - library_data_t::real_float, library_data_t::real_float, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_double, library_data_t::real_double, - library_data_t::real_double, library_data_t::real_double): - { - detail::gemm_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_float, library_data_t::complex_float, - library_data_t::complex_float, library_data_t::complex_float): - { - detail::gemm_impl, std::complex, - std::complex, std::complex>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_double, library_data_t::complex_double, - library_data_t::complex_double, library_data_t::complex_double): - { - detail::gemm_impl, std::complex, - std::complex, std::complex>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_half): - { - detail::gemm_impl(q, a_trans, b_trans, m, n, k, alpha, a, - lda, b, ldb, beta, c, ldc); - break; - } -#ifdef __INTEL_MKL__ - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_float): - { - float alpha_value = - dpct::get_value(reinterpret_cast(alpha), q); - float beta_value = - dpct::get_value(reinterpret_cast(beta), q); - sycl::half alpha_half(alpha_value); - sycl::half beta_half(beta_value); - detail::gemm_impl(q, a_trans, b_trans, m, n, k, &alpha_half, - a, lda, b, ldb, &beta_half, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_bfloat16, library_data_t::real_float): - { - detail::gemm_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_int32, library_data_t::real_int32): - { - float alpha_float = - dpct::get_value(reinterpret_cast(alpha), q); - float beta_float = - dpct::get_value(reinterpret_cast(beta), q); - detail::gemm_impl( - q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc); - break; - } -#endif // __INTEL_MKL__ - default: - throw std::runtime_error("the combination of data type is unsupported"); - } - } // gemm() - - /// Computes a batch of matrix-matrix product with general matrices. - /// \param [in] q The queue where the routine should be executed. - /// \param [in] a_trans Specifies the operation applied to A. - /// \param [in] b_trans Specifies the operation applied to B. - /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. - /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. - /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). - /// \param [in] alpha Scaling factor for the matrix-matrix product. - /// \param [in] a Input matrix A. - /// \param [in] a_type Data type of the matrix A. - /// \param [in] lda Leading dimension of A. - /// \param [in] b Input matrix B. - /// \param [in] b_type Data type of the matrix B. - /// \param [in] ldb Leading dimension of B. - /// \param [in] beta Scaling factor for matrix C. - /// \param [in, out] c Input/Output matrix C. - /// \param [in] c_type Data type of the matrix C. - /// \param [in] ldc Leading dimension of C. - /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. - /// \param [in] scaling_type Data type of the scaling factors. - inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, - int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda, - const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[], - library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type, - matrix_info_t * matrix_info) { - std::uint64_t key = - detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); - switch (key) - { - case detail::get_type_combination_id( - library_data_t::real_float, library_data_t::real_float, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, batch_size, matrix_info); - break; - } - case detail::get_type_combination_id( - library_data_t::real_double, library_data_t::real_double, - library_data_t::real_double, library_data_t::real_double): - { - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, - beta, c, ldc, batch_size, matrix_info); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_half): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); - break; - } -#ifdef __INTEL_MKL__ - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_bfloat16, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); - break; - } - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); - break; - } -#endif - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_int32, library_data_t::real_int32): - { - float alpha_float = - dpct::get_value(reinterpret_cast(alpha), q); - float beta_float = - dpct::get_value(reinterpret_cast(beta), q); - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size, - matrix_info); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_float): - { - float alpha_value = - dpct::get_value(reinterpret_cast(alpha), q); - float beta_value = - dpct::get_value(reinterpret_cast(beta), q); - sycl::half alpha_half(alpha_value); - sycl::half beta_half(beta_value); - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info); - break; - } - default: - throw std::runtime_error("the combination of data type is unsupported"); - } - } - - /// Computes a batch of matrix-matrix product with general matrices. - /// \param [in] q The queue where the routine should be executed. - /// \param [in] a_trans Specifies the operation applied to A. - /// \param [in] b_trans Specifies the operation applied to B. - /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. - /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. - /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). - /// \param [in] alpha Scaling factor for the matrix-matrix product. - /// \param [in] a Input matrix A. - /// \param [in] a_type Data type of the matrix A. - /// \param [in] lda Leading dimension of A. - /// \param [in] stride_a Stride between the different A matrices. - /// \param [in] b Input matrix B. - /// \param [in] b_type Data type of the matrix B. - /// \param [in] ldb Leading dimension of B. - /// \param [in] stride_b Stride between the different B matrices. - /// \param [in] beta Scaling factor for matrix C. - /// \param [in, out] c Input/Output matrix C. - /// \param [in] c_type Data type of the matrix C. - /// \param [in] ldc Leading dimension of C. - /// \param [in] stride_c Stride between the different C matrices. - /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. - /// \param [in] scaling_type Data type of the scaling factors. - inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, - int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda, - long long int stride_a, const void * b, library_data_t b_type, int ldb, - long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc, - long long int stride_c, int batch_size, library_data_t scaling_type) { - if (scaling_type == library_data_t::real_float && - c_type == library_data_t::complex_float) - { - scaling_type = library_data_t::complex_float; - } - else if (scaling_type == library_data_t::real_double && - c_type == library_data_t::complex_double) - { - scaling_type = library_data_t::complex_double; - } - - std::uint64_t key = - detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); - switch (key) - { - case detail::get_type_combination_id( - library_data_t::real_float, library_data_t::real_float, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_double, library_data_t::real_double, - library_data_t::real_double, library_data_t::real_double): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_float, library_data_t::complex_float, - library_data_t::complex_float, library_data_t::complex_float): - { - detail::gemm_batch_impl, std::complex, - std::complex, std::complex>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::complex_double, library_data_t::complex_double, - library_data_t::complex_double, library_data_t::complex_double): - { - detail::gemm_batch_impl, std::complex, - std::complex, std::complex>( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_half): - { - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, - a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } -#ifdef __INTEL_MKL__ - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_bfloat16, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, - batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_bfloat16, library_data_t::real_bfloat16, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, - batch_size); - break; - } -#endif - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_int32, library_data_t::real_int32): - { - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, - a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_int8, library_data_t::real_int8, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_float, library_data_t::real_float): - { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); - break; - } - case detail::get_type_combination_id( - library_data_t::real_half, library_data_t::real_half, - library_data_t::real_half, library_data_t::real_float): - { - float alpha_value = - dpct::get_value(reinterpret_cast(alpha), q); - float beta_value = - dpct::get_value(reinterpret_cast(beta), q); - sycl::half alpha_half(alpha_value); - sycl::half beta_half(beta_value); - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b, - &beta_half, c, ldc, stride_c, batch_size); - break; - } - default: - throw std::runtime_error("the combination of data type is unsupported"); - } - } - - static inline void - async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr, - size_t from_pitch, size_t x, size_t y, - memcpy_direction direction = automatic, - sycl::queue &q = get_default_queue()) - { - detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y, - direction); - } - - using err0 = detail::generic_error_type; - using err1 = detail::generic_error_type; - - static inline void dpct_free(void *ptr, sycl::queue &q = get_default_queue()) { - detail::dpct_free(ptr, q); - } - - /// dpct accessor used as device function parameter. - template class accessor; - template class accessor { - public: - using memory_t = detail::memory_traits; - using element_t = typename memory_t::element_t; - using pointer_t = typename memory_t::pointer_t; - using accessor_t = typename memory_t::template accessor_t<3>; - accessor(pointer_t data, const sycl::range<3> &in_range) - : _data(data), _range(in_range) {} - template - accessor(typename std::enable_if::type &acc) - : accessor(acc, acc.get_range()) {} - accessor(const accessor_t &acc, const sycl::range<3> &in_range) - : accessor(acc.get_pointer(), in_range) {} - accessor operator[](size_t index) const { - sycl::range<2> sub(_range.get(1), _range.get(2)); - return accessor(_data + index * sub.size(), sub); - } - - pointer_t get_ptr() const { return _data; } - - private: - pointer_t _data; - sycl::range<3> _range; - }; - template class accessor { - public: - using memory_t = detail::memory_traits; - using element_t = typename memory_t::element_t; - using pointer_t = typename memory_t::pointer_t; - using accessor_t = typename memory_t::template accessor_t<2>; - accessor(pointer_t data, const sycl::range<2> &in_range) - : _data(data), _range(in_range) {} - template - accessor(typename std::enable_if::type &acc) - : accessor(acc, acc.get_range()) {} - accessor(const accessor_t &acc, const sycl::range<2> &in_range) - : accessor(acc.get_pointer(), in_range) {} - - pointer_t operator[](size_t index) const { - return _data + _range.get(1) * index; - } - - pointer_t get_ptr() const { return _data; } - - private: - pointer_t _data; - sycl::range<2> _range; - }; - - namespace detail { - /// Device variable with address space of shared, global or constant. - template class device_memory { - public: - using accessor_t = - typename detail::memory_traits::template accessor_t; - using value_t = typename detail::memory_traits::value_t; - using dpct_accessor_t = dpct::accessor; - - device_memory() : device_memory(sycl::range(1)) {} - - /// Constructor of 1-D array with initializer list - device_memory(const sycl::range &in_range, - std::initializer_list &&init_list) - : device_memory(in_range) { - assert(init_list.size() <= in_range.size()); - _host_ptr = (value_t *)std::malloc(_size); - std::memset(_host_ptr, 0, _size); - std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T)); - } - - /// Constructor of 2-D array with initializer list - template - device_memory( - const typename std::enable_if>::type &in_range, - std::initializer_list> &&init_list) - : device_memory(in_range) { - assert(init_list.size() <= in_range[0]); - _host_ptr = (value_t *)std::malloc(_size); - std::memset(_host_ptr, 0, _size); - auto tmp_data = _host_ptr; - for (auto sub_list : init_list) { - assert(sub_list.size() <= in_range[1]); - std::memcpy(tmp_data, sub_list.begin(), - sub_list.size() * sizeof(T)); - tmp_data += in_range[1]; - } - } - - /// Constructor with range - device_memory(const sycl::range &range_in) - : _size(range_in.size() * sizeof(T)), _range(range_in), - _reference(false), _host_ptr(nullptr), _device_ptr(nullptr) { - static_assert( - (Memory == global) || (Memory == constant) || (Memory == shared), - "device memory region should be global, constant or shared"); - // Make sure that singleton class mem_mgr and dev_mgr will destruct - // later than this. - detail::mem_mgr::instance(); - dev_mgr::instance(); - } - - /// Constructor with range - template - device_memory(Args... Arguments) - : device_memory(sycl::range(Arguments...)) {} - - ~device_memory() { - if (_device_ptr && !_reference) - dpct::dpct_free(_device_ptr); - if (_host_ptr) - std::free(_host_ptr); - } - - /// Allocate memory with default queue, and init memory if has initial - /// value. - void init() { init(dpct::get_default_queue()); } - /// Allocate memory with specified queue, and init memory if has initial - /// value. - void init(sycl::queue &q) { - if (_device_ptr) - return; - if (!_size) - return; - allocate_device(q); - if (_host_ptr) - detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size, - host_to_device); - } - - /// The variable is assigned to a device pointer. - void assign(value_t *src, size_t size) { - this->~device_memory(); - new (this) device_memory(src, size); - } - - /// Get memory pointer of the memory object, which is virtual pointer when - /// usm is not used, and device pointer when usm is used. - value_t *get_ptr() { return get_ptr(get_default_queue()); } - /// Get memory pointer of the memory object, which is virtual pointer when - /// usm is not used, and device pointer when usm is used. - value_t *get_ptr(sycl::queue &q) { - init(q); - return _device_ptr; - } - - /// Get the device memory object size in bytes. - size_t get_size() { return _size; } - - template - typename std::enable_if::type &operator[](size_t index) { - init(); - return _device_ptr[index]; - } - - /// Get dpct::accessor with dimension info for the device memory object - /// when usm is used and dimension is greater than 1. - template - typename std::enable_if::type - get_access([[maybe_unused]] sycl::handler &cgh) { - return dpct_accessor_t((T *)_device_ptr, _range); - } - - private: - device_memory(value_t *memory_ptr, size_t size) - : _size(size), _range(size / sizeof(T)), _reference(true), - _device_ptr(memory_ptr) {} - - void allocate_device(sycl::queue &q) { - #ifndef DPCT_USM_LEVEL_NONE - if (Memory == shared) { - _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(), - q.get_context()); - return; - } - #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY - if (Memory == constant) { - _device_ptr = (value_t *)sycl::malloc_device( - _size, q.get_device(), q.get_context(), - sycl::ext::oneapi::property::usm::device_read_only()); - return; - } - #endif - #endif - _device_ptr = (value_t *)detail::dpct_malloc(_size, q); - } - - size_t _size; - sycl::range _range; - bool _reference; - value_t *_host_ptr; - value_t *_device_ptr; - }; - template - class device_memory : public device_memory { - public: - using base = device_memory; - using value_t = typename base::value_t; - using accessor_t = - typename detail::memory_traits::template accessor_t<0>; - - /// Constructor with initial value. - device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {} - - /// Default constructor - device_memory() : base(1) {} - }; - } // namespace detail - - template - using global_memory = detail::device_memory; - template - using constant_memory = detail::device_memory; - template - using shared_memory = detail::device_memory; - - - template - inline T atomic_fetch_add(T *addr, T operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_add(operand); - } - - template - inline T1 atomic_fetch_add(T1 *addr, T2 operand) { - auto atm = - sycl::atomic_ref(addr[0]); - return atm.fetch_add(operand); - } - - template - inline T atomic_fetch_add(T *addr, T operand, - sycl::memory_order memoryOrder) { - switch (memoryOrder) { - case sycl::memory_order::relaxed: - return atomic_fetch_add(addr, operand); - case sycl::memory_order::acq_rel: - return atomic_fetch_add(addr, operand); - case sycl::memory_order::seq_cst: - return atomic_fetch_add(addr, operand); - default: - assert(false && "Invalid memory_order for atomics. Valid memory_order for " - "atomics are: sycl::memory_order::relaxed, " - "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); - } - } - - template - inline T1 atomic_fetch_add(T1 *addr, T2 operand, - sycl::memory_order memoryOrder) { - atomic_fetch_add(addr, operand, memoryOrder); - } - -} // COPY from DPCT head files - -#endif // GGML_SYCL_DPCT_HELPER_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_DPCT_HELPER_HPP +#define GGML_SYCL_DPCT_HELPER_HPP + +#include +#include +#include +#include + +#ifdef GGML_SYCL_USE_INTEL_ONEMKL +#include +// Allow to use the same namespace for Intel oneMKL and oneMath +namespace oneapi { + namespace math = mkl; +} +#else +#include +#endif + +#include "ggml.h" + +#if defined(__linux__) +#include +#elif defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#else +#error "Only support Windows and Linux." +#endif + +#if defined(__linux__) +#include +#include +#endif +#if defined(_WIN64) +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#endif + +#define DPCT_COMPATIBILITY_TEMP (900) + +#if defined(_MSC_VER) +#define __dpct_align__(n) __declspec(align(n)) +#define __dpct_inline__ __forceinline +#else +#define __dpct_align__(n) __attribute__((aligned(n))) +#define __dpct_inline__ __inline__ __attribute__((always_inline)) +#endif + +#if defined(_MSC_VER) +#define __dpct_noinline__ __declspec(noinline) +#else +#define __dpct_noinline__ __attribute__((noinline)) +#endif + +inline std::string get_device_type_name(const sycl::device &Device) { + auto DeviceType = Device.get_info(); + switch (DeviceType) { + case sycl::info::device_type::cpu: + return "cpu"; + case sycl::info::device_type::gpu: + return "gpu"; + case sycl::info::device_type::host: + return "host"; + case sycl::info::device_type::accelerator: + return "acc"; + default: + return "unknown"; + } +} + +inline std::string get_device_backend_and_type(const sycl::device &device) { + std::stringstream device_type; + sycl::backend backend = device.get_backend(); + device_type << backend << ":" << get_device_type_name(device); + return device_type.str(); +} + +template struct matrix_info_t { + oneapi::math::transpose transpose_info[2]; + Ts value_info[2]; + std::int64_t size_info[3]; + std::int64_t ld_info[3]; + std::int64_t groupsize_info; +}; + +inline auto get_onemath_backend(sycl::queue& queue) +#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL) + -> sycl::queue& +#endif +{ +// If the backend is known at compile-time, use oneMath backend_selector to use +// compile-time dispatching and avoid the need to dlopen libraries. Otherwise +// fallback to runtime dispatching. +#if defined(GGML_SYCL_NVIDIA) + return oneapi::math::backend_selector{ queue }; +#elif defined(GGML_SYCL_AMD) + return oneapi::math::backend_selector{ queue }; +#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL) + return queue; +#else + static_assert(false, "Unsupported backend"); +#endif +} + +namespace dpct +{ + typedef sycl::queue *queue_ptr; + typedef sycl::event *event_ptr; + typedef char *device_ptr; + typedef uint8_t byte_t; + typedef sycl::buffer buffer_t; + + /// SYCL default exception handler + inline auto exception_handler = [](sycl::exception_list exceptions) + { + for (std::exception_ptr const &e : exceptions) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception const &e) + { + std::cerr << "Caught asynchronous SYCL exception:" << std::endl + << e.what() << std::endl + << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + } + } + }; + + enum error_code + { + success = 0, + default_error = 999 + }; + + enum memcpy_direction + { + host_to_host, + host_to_device, + device_to_host, + device_to_device, + automatic + }; + + enum memory_region + { + global = 0, // device global memory + constant, // device constant memory + local, // device local memory + shared, // memory which can be accessed by host and device + }; + + enum class library_data_t : unsigned char + { + real_float = 0, + complex_float, + real_double, + complex_double, + real_half, + complex_half, + real_bfloat16, + complex_bfloat16, + real_int4, + complex_int4, + real_uint4, + complex_uint4, + real_int8, + complex_int8, + real_uint8, + complex_uint8, + real_int16, + complex_int16, + real_uint16, + complex_uint16, + real_int32, + complex_int32, + real_uint32, + complex_uint32, + real_int64, + complex_int64, + real_uint64, + complex_uint64, + real_int8_4, + real_int8_32, + real_uint8_4, + library_data_t_size + }; + + template + struct DataType + { + using T2 = T; + }; + template + struct DataType> + { + using T2 = std::complex; + }; + + static void destroy_event(event_ptr event) + { + delete event; + } + + static inline unsigned int get_tid() + { +#if defined(__linux__) + return syscall(SYS_gettid); +#elif defined(_WIN64) + return GetCurrentThreadId(); +#else +#error "Only support Windows and Linux." +#endif + } + + namespace detail + { + static void get_version(const sycl::device &dev, int &major, int &minor) + { + // Version string has the following format: + // a. OpenCL + // b. + // c. e.g gfx1030 + std::string ver; + ver = dev.get_info(); + std::string::size_type i = 0; + while (i < ver.size()) { + if (isdigit(ver[i])) + break; + i++; + } + major = std::stoi(&(ver[i])); + while (i < ver.size()) { + if (ver[i] == '.') + break; + i++; + } + if (i < ver.size()) { + // a. and b. + i++; + minor = std::stoi(&(ver[i])); + } else { + // c. + minor = 0; + } + } + + template + class generic_error_type + { + public: + generic_error_type() = default; + generic_error_type(T value) : value{value} {} + operator T() const { return value; } + + private: + T value; + }; + + } // namespace detail + + // COPY from DPCT head files + /// dim3 is used to store 3 component dimensions. + class dim3 { + public: + unsigned x, y, z; + + constexpr dim3(unsigned x = 1, unsigned y = 1, unsigned z = 1) + : x(x), y(y), z(z) {} + + dim3(const sycl::id<3> &r) : dim3(r[2], r[1], r[0]) {} + + operator sycl::range<3>() const { return sycl::range<3>(z, y, x); } + }; // namespace dim3 + + inline dim3 operator*(const dim3 &a, const dim3 &b) { + return dim3{a.x * b.x, a.y * b.y, a.z * b.z}; + } + // COPY from DPCT head files + + + /// Pitched 2D/3D memory data. + class pitched_data + { + public: + pitched_data() : pitched_data(nullptr, 0, 0, 0) {} + pitched_data(void *data, size_t pitch, size_t x, size_t y) + : _data(data), _pitch(pitch), _x(x), _y(y) {} + + void *get_data_ptr() { return _data; } + void set_data_ptr(void *data) { _data = data; } + + size_t get_pitch() { return _pitch; } + void set_pitch(size_t pitch) { _pitch = pitch; } + + size_t get_x() { return _x; } + void set_x(size_t x) { _x = x; } + + size_t get_y() { return _y; } + void set_y(size_t y) { _y = y; } + + private: + void *_data; + size_t _pitch, _x, _y; + }; + + class device_info + { + public: + // get interface + const char *get_name() const { return _name; } + char *get_name() { return _name; } + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() const + { + if constexpr (std::is_same_v>) + return sycl::range<3>(_max_work_item_sizes_i[0], + _max_work_item_sizes_i[1], + _max_work_item_sizes_i[2]); + else + { + return _max_work_item_sizes_i; + } + } + template , + std::enable_if_t> || + std::is_same_v, + int> = 0> + auto get_max_work_item_sizes() + { + if constexpr (std::is_same_v>) + return sycl::range<3>(_max_work_item_sizes_i[0], + _max_work_item_sizes_i[1], + _max_work_item_sizes_i[2]); + else + { + return _max_work_item_sizes_i; + } + } + bool get_host_unified_memory() const { return _host_unified_memory; } + int get_major_version() const { return _major; } + int get_minor_version() const { return _minor; } + int get_integrated() const { return _integrated; } + int get_max_clock_frequency() const { return _frequency; } + int get_max_compute_units() const { return _max_compute_units; } + int get_max_work_group_size() const { return _max_work_group_size; } + int get_max_sub_group_size() const { return _max_sub_group_size; } + int get_max_work_items_per_compute_unit() const + { + return _max_work_items_per_compute_unit; + } + int get_max_register_size_per_work_group() const + { + return _max_register_size_per_work_group; + } + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() const + { + if constexpr (std::is_same_v) + return _max_nd_range_size; + else + return _max_nd_range_size_i; + } + template || + std::is_same_v, + int> = 0> + auto get_max_nd_range_size() + { + if constexpr (std::is_same_v) + return _max_nd_range_size; + else + return _max_nd_range_size_i; + } + size_t get_global_mem_size() const { return _global_mem_size; } + size_t get_local_mem_size() const { return _local_mem_size; } + size_t get_max_mem_alloc_size() const { return _max_mem_alloc_size; } + /// Returns the maximum clock rate of device's global memory in kHz. If + /// compiler does not support this API then returns default value 3200000 kHz. + unsigned int get_memory_clock_rate() const { return _memory_clock_rate; } + /// Returns the maximum bus width between device and memory in bits. If + /// compiler does not support this API then returns default value 64 bits. + unsigned int get_memory_bus_width() const { return _memory_bus_width; } + uint32_t get_device_id() const { return _device_id; } + std::array get_uuid() const { return _uuid; } + /// Returns global memory cache size in bytes. + unsigned int get_global_mem_cache_size() const + { + return _global_mem_cache_size; + } + + // set interface + void set_name(const char *name) + { + size_t length = strlen(name); + if (length < 256) + { + std::memcpy(_name, name, length + 1); + } + else + { + std::memcpy(_name, name, 255); + _name[255] = '\0'; + } + } + void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) + { + for (int i = 0; i < 3; ++i) + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + [[deprecated]] void + set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) + { + for (int i = 0; i < 3; ++i) + { + _max_work_item_sizes_i[i] = max_work_item_sizes[i]; + } + } + void set_host_unified_memory(bool host_unified_memory) + { + _host_unified_memory = host_unified_memory; + } + void set_major_version(int major) { _major = major; } + void set_minor_version(int minor) { _minor = minor; } + void set_integrated(int integrated) { _integrated = integrated; } + void set_max_clock_frequency(int frequency) { _frequency = frequency; } + void set_max_compute_units(int max_compute_units) + { + _max_compute_units = max_compute_units; + } + void set_global_mem_size(size_t global_mem_size) + { + _global_mem_size = global_mem_size; + } + void set_local_mem_size(size_t local_mem_size) + { + _local_mem_size = local_mem_size; + } + void set_max_mem_alloc_size(size_t max_mem_alloc_size) + { + _max_mem_alloc_size = max_mem_alloc_size; + } + void set_max_work_group_size(int max_work_group_size) + { + _max_work_group_size = max_work_group_size; + } + void set_max_sub_group_size(int max_sub_group_size) + { + _max_sub_group_size = max_sub_group_size; + } + void + set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) + { + _max_work_items_per_compute_unit = max_work_items_per_compute_unit; + } + void set_max_nd_range_size(int max_nd_range_size[]) + { + for (int i = 0; i < 3; i++) + { + _max_nd_range_size[i] = max_nd_range_size[i]; + _max_nd_range_size_i[i] = max_nd_range_size[i]; + } + } + void set_memory_clock_rate(unsigned int memory_clock_rate) + { + _memory_clock_rate = memory_clock_rate; + } + void set_memory_bus_width(unsigned int memory_bus_width) + { + _memory_bus_width = memory_bus_width; + } + void + set_max_register_size_per_work_group(int max_register_size_per_work_group) + { + _max_register_size_per_work_group = max_register_size_per_work_group; + } + void set_device_id(uint32_t device_id) + { + _device_id = device_id; + } + void set_uuid(std::array uuid) + { + _uuid = std::move(uuid); + } + void set_global_mem_cache_size(unsigned int global_mem_cache_size) + { + _global_mem_cache_size = global_mem_cache_size; + } + + private: + char _name[256]; + int _max_work_item_sizes_i[3]; + bool _host_unified_memory = false; + int _major; + int _minor; + int _integrated = 0; + int _frequency; + // Set estimated value 3200000 kHz as default value. + unsigned int _memory_clock_rate = 3200000; + // Set estimated value 64 bits as default value. + unsigned int _memory_bus_width = 64; + unsigned int _global_mem_cache_size; + int _max_compute_units; + int _max_work_group_size; + int _max_sub_group_size; + int _max_work_items_per_compute_unit; + int _max_register_size_per_work_group; + size_t _global_mem_size; + size_t _local_mem_size; + size_t _max_mem_alloc_size; + size_t _max_nd_range_size[3]; + int _max_nd_range_size_i[3]; + uint32_t _device_id; + std::array _uuid; + }; + + static int get_major_version(const sycl::device &dev) + { + int major, minor; + detail::get_version(dev, major, minor); + return major; + } + + static int get_minor_version(const sycl::device &dev) + { + int major, minor; + detail::get_version(dev, major, minor); + return minor; + } + + static void get_device_info(device_info &out, const sycl::device &dev) + { + device_info prop; + prop.set_name(dev.get_info().c_str()); + + int major, minor; + detail::get_version(dev, major, minor); + prop.set_major_version(major); + prop.set_minor_version(minor); + + prop.set_max_work_item_sizes( +#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902) + // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes + // is an enum class element + dev.get_info()); +#else + // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by + // an int + dev.get_info>()); +#endif + prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations)); + + prop.set_max_clock_frequency( + dev.get_info() * 1000); + + prop.set_max_compute_units( + dev.get_info()); + prop.set_max_work_group_size( + dev.get_info()); + prop.set_global_mem_size(dev.get_info()); + prop.set_local_mem_size(dev.get_info()); + prop.set_max_mem_alloc_size(dev.get_info()); + +#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6) + if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) + { + unsigned int tmp = + dev.get_info(); + if (tmp != 0) + prop.set_memory_clock_rate(1000 * tmp); + } + if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) + { + prop.set_memory_bus_width( + dev.get_info()); + } + if (dev.has(sycl::aspect::ext_intel_device_id)) + { + prop.set_device_id( + dev.get_info()); + } + if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) + { + prop.set_uuid(dev.get_info()); + } +#elif defined(_MSC_VER) && !defined(__clang__) +#pragma message("get_device_info: querying memory_clock_rate and \ + memory_bus_width are not supported by the compiler used. \ + Use 3200000 kHz as memory_clock_rate default value. \ + Use 64 bits as memory_bus_width default value.") +#else +#warning "get_device_info: querying memory_clock_rate and \ + memory_bus_width are not supported by the compiler used. \ + Use 3200000 kHz as memory_clock_rate default value. \ + Use 64 bits as memory_bus_width default value." +#endif + + size_t max_sub_group_size = 1; + std::vector sub_group_sizes = + dev.get_info(); + + for (const auto &sub_group_size : sub_group_sizes) + { + if (max_sub_group_size < sub_group_size) + max_sub_group_size = sub_group_size; + } + + prop.set_max_sub_group_size(max_sub_group_size); + + prop.set_max_work_items_per_compute_unit( + dev.get_info()); + int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; + prop.set_max_nd_range_size(max_nd_range_size); + + // Estimates max register size per work group, feel free to update the value + // according to device properties. + prop.set_max_register_size_per_work_group(65536); + + prop.set_global_mem_cache_size( + dev.get_info()); + out = prop; + } + + /// dpct device extension + class device_ext : public sycl::device { + typedef std::mutex mutex_type; + + public: + device_ext() : sycl::device() {} + ~device_ext() { + std::lock_guard lock(m_mutex); + clear_queues(); + } + device_ext(const sycl::device &base) : sycl::device(base) { + std::lock_guard lock(m_mutex); + init_queues(); + } + + int is_native_atomic_supported() { return 0; } + int get_major_version() const { return dpct::get_major_version(*this); } + + int get_minor_version() const { return dpct::get_minor_version(*this); } + + int get_max_compute_units() const { + return get_device_info().get_max_compute_units(); + } + + /// Return the maximum clock frequency of this device in KHz. + int get_max_clock_frequency() const { + return get_device_info().get_max_clock_frequency(); + } + + int get_integrated() const { return get_device_info().get_integrated(); } + + int get_max_sub_group_size() const { + return get_device_info().get_max_sub_group_size(); + } + + int get_max_register_size_per_work_group() const { + return get_device_info().get_max_register_size_per_work_group(); + } + + int get_max_work_group_size() const { + return get_device_info().get_max_work_group_size(); + } + + int get_mem_base_addr_align() const { + return get_info(); + } + + size_t get_global_mem_size() const { + return get_device_info().get_global_mem_size(); + } + + size_t get_max_mem_alloc_size() const { + return get_device_info().get_max_mem_alloc_size(); + } + + /// Get the number of bytes of free and total memory on the SYCL device. + /// \param [out] free_memory The number of bytes of free memory on the + /// SYCL device. \param [out] total_memory The number of bytes of total + /// memory on the SYCL device. + void get_memory_info(size_t &free_memory, size_t &total_memory) { + total_memory = get_device_info().get_global_mem_size(); + const char *warning_info = + "get_memory_info: [warning] ext_intel_free_memory is not " + "supported (export/set ZES_ENABLE_SYSMAN=1 to support), " + "use total memory as free memory"; +#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) + if (!has(sycl::aspect::ext_intel_free_memory)) { + std::cerr << warning_info << std::endl; + free_memory = total_memory; + } else { + free_memory = get_info(); + } +#else + std::cerr << warning_info << std::endl; + free_memory = total_memory; +#if defined(_MSC_VER) && !defined(__clang__) +#pragma message("Querying the number of bytes of free memory is not supported") +#else +#warning "Querying the number of bytes of free memory is not supported" +#endif +#endif + } + + void get_device_info(device_info &out) const { + dpct::get_device_info(out, *this); + } + + device_info get_device_info() const { + device_info prop; + dpct::get_device_info(prop, *this); + return prop; + } + + void reset() { + std::lock_guard lock(m_mutex); + clear_queues(); + init_queues(); + } + + sycl::queue &in_order_queue() { return _q_in_order; } + + sycl::queue &out_of_order_queue() { return _q_out_of_order; } + + sycl::queue &default_queue() { return in_order_queue(); } + + void queues_wait_and_throw() { + std::unique_lock lock(m_mutex); + lock.unlock(); + for (auto &q : _queues) { + q.wait_and_throw(); + } + // Guard the destruct of current_queues to make sure the ref count is + // safe. + lock.lock(); + } + + sycl::queue create_queue(bool enable_exception_handler = false) { + return create_in_order_queue(enable_exception_handler); + } + + sycl::queue create_queue(sycl::device device, + bool enable_exception_handler = false) { + return create_in_order_queue(device, enable_exception_handler); + } + + sycl::queue create_in_order_queue(bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(enable_exception_handler, + sycl::property::queue::in_order()); + } + + sycl::queue create_in_order_queue(sycl::device device, + bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(device, enable_exception_handler, + sycl::property::queue::in_order()); + } + + sycl::queue create_out_of_order_queue( + bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(enable_exception_handler); + } + + void destroy_queue(sycl::queue queue) { + std::lock_guard lock(m_mutex); + _queues.erase(std::remove_if(_queues.begin(), _queues.end(), + [=](const sycl::queue &q) -> bool + { + return q == queue; + }), + _queues.end()); + } + void set_saved_queue(sycl::queue q) { + std::lock_guard lock(m_mutex); + _saved_queue = q; + } + sycl::queue get_saved_queue() const { + std::lock_guard lock(m_mutex); + return _saved_queue; + } + + private: + void clear_queues() { _queues.clear(); } + + void init_queues() { + _q_in_order = + create_queue_impl(true, sycl::property::queue::in_order()); + _q_out_of_order = create_queue_impl(true); + _saved_queue = default_queue(); + } + + /// Caller should acquire resource \p m_mutex before calling this + /// function. + template + sycl::queue create_queue_impl(bool enable_exception_handler, + Properties... properties) { + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; + } + _queues.push_back(sycl::queue( + *this, eh, + sycl::property_list( +#ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), +#endif + properties...))); + + return _queues.back(); + } + + template + sycl::queue create_queue_impl(sycl::device device, + bool enable_exception_handler, + Properties... properties) { + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; + } + _queues.push_back(sycl::queue( + device, eh, + sycl::property_list( +#ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), +#endif + properties...))); + + return _queues.back(); + } + + void get_version(int &major, int &minor) const { + detail::get_version(*this, major, minor); + } + sycl::queue _q_in_order, _q_out_of_order; + sycl::queue _saved_queue; + std::vector _queues; + mutable mutex_type m_mutex; + }; + + + /// device manager + class dev_mgr + { + public: + device_ext ¤t_device() + { + unsigned int dev_id = current_device_id(); + check_id(dev_id); + return *_devs[dev_id]; + } + device_ext &cpu_device() const + { + std::lock_guard lock(m_mutex); + if (_cpu_device == -1) + { + throw std::runtime_error("no valid cpu device"); + } + else + { + return *_devs[_cpu_device]; + } + } + device_ext &get_device(unsigned int id) const + { + std::lock_guard lock(m_mutex); + check_id(id); + return *_devs[id]; + } + unsigned int current_device_id() const + { + std::lock_guard lock(m_mutex); + auto it = _thread2dev_map.find(get_tid()); + if (it != _thread2dev_map.end()) + return it->second; + return DEFAULT_DEVICE_ID; + } + + /// Select device with a device ID. + /// \param [in] id The id of the device which can + /// be obtained through get_device_id(const sycl::device). + void select_device(unsigned int id) + { + std::lock_guard lock(m_mutex); + check_id(id); + _thread2dev_map[get_tid()] = id; + } + unsigned int device_count() { return _devs.size(); } + + unsigned int get_device_id(const sycl::device &dev) + { + unsigned int id = 0; + for (auto &dev_item : _devs) + { + if (*dev_item == dev) + { + return id; + } + id++; + } + return -1; + } + + inline std::string get_preferred_gpu_platform_name() { + std::string result; + + std::string filter = ""; + char* env = getenv("ONEAPI_DEVICE_SELECTOR"); + if (env) { + if (std::strstr(env, "level_zero")) { + filter = "level-zero"; + } + else if (std::strstr(env, "opencl")) { + filter = "opencl"; + } + else if (std::strstr(env, "cuda")) { + filter = "cuda"; + } + else if (std::strstr(env, "hip")) { + filter = "hip"; + } + else { + throw std::runtime_error("invalid device filter: " + std::string(env)); + } + } else { + auto default_device = sycl::device(sycl::default_selector_v); + auto default_platform_name = default_device.get_platform().get_info(); + + if (std::strstr(default_platform_name.c_str(), "Level-Zero") || default_device.is_cpu()) { + filter = "level-zero"; + } + else if (std::strstr(default_platform_name.c_str(), "CUDA")) { + filter = "cuda"; + } + else if (std::strstr(default_platform_name.c_str(), "HIP")) { + filter = "hip"; + } + } + + auto platform_list = sycl::platform::get_platforms(); + + for (const auto& platform : platform_list) { + auto devices = platform.get_devices(); + auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) { + return d.is_gpu(); + }); + + if (gpu_dev == devices.end()) { + // cout << "platform [" << platform_name + // << "] does not contain GPU devices, skipping\n"; + continue; + } + + auto platform_name = platform.get_info(); + std::string platform_name_low_case; + platform_name_low_case.resize(platform_name.size()); + + std::transform( + platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower); + + if (platform_name_low_case.find(filter) == std::string::npos) { + // cout << "platform [" << platform_name + // << "] does not match with requested " + // << filter << ", skipping\n"; + continue; + } + + result = platform_name; + } + + if (result.empty()) + throw std::runtime_error("can not find preferred GPU platform"); + + return result; + } + + template + std::enable_if_t< + std::is_invocable_r_v> + select_device(const DeviceSelector &selector = sycl::gpu_selector_v) + { + sycl::device selected_device = sycl::device(selector); + unsigned int selected_device_id = get_device_id(selected_device); + select_device(selected_device_id); + } + + /// Returns the instance of device manager singleton. + static dev_mgr &instance() + { + static dev_mgr d_m; + return d_m; + } + dev_mgr(const dev_mgr &) = delete; + dev_mgr &operator=(const dev_mgr &) = delete; + dev_mgr(dev_mgr &&) = delete; + dev_mgr &operator=(dev_mgr &&) = delete; + + private: + mutable std::recursive_mutex m_mutex; + static bool compare_dev(sycl::device &device1, sycl::device &device2) + { + sycl::backend backend1 = device1.get_backend(); + sycl::backend backend2 = device2.get_backend(); + // levelzero backends always come first + if(backend1 == sycl::backend::ext_oneapi_level_zero && backend2 != sycl::backend::ext_oneapi_level_zero) return true; + if(backend1 != sycl::backend::ext_oneapi_level_zero && backend2 == sycl::backend::ext_oneapi_level_zero) return false; + dpct::device_info prop1; + dpct::get_device_info(prop1, device1); + dpct::device_info prop2; + dpct::get_device_info(prop2, device2); + return prop1.get_max_compute_units() > prop2.get_max_compute_units(); + } + static int convert_backend_index(std::string & backend) { + if (backend == "ext_oneapi_level_zero:gpu") return 0; + if (backend == "opencl:gpu") return 1; + if (backend == "ext_oneapi_cuda:gpu") return 2; + if (backend == "ext_oneapi_hip:gpu") return 3; + if (backend == "opencl:cpu") return 4; + if (backend == "opencl:acc") return 5; + printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); + GGML_ABORT("fatal error"); + } + static bool compare_backend(std::string &backend1, std::string &backend2) { + return convert_backend_index(backend1) < convert_backend_index(backend2); + } + dev_mgr() + { + sycl::device default_device = + sycl::device(sycl::default_selector_v); + _devs.push_back(std::make_shared(default_device)); + + std::vector sycl_all_devs; + // Collect other devices except for the default device. + if (default_device.is_cpu()) + _cpu_device = 0; + + auto Platforms = sycl::platform::get_platforms(); + // Keep track of the number of devices per backend + std::map DeviceNums; + std::map> backend_devices; + auto preferred_platform_name = get_preferred_gpu_platform_name(); + + while (!Platforms.empty()) { + auto Platform = Platforms.back(); + Platforms.pop_back(); + auto platform_name = Platform.get_info(); + if (platform_name.compare(preferred_platform_name) != 0) { + continue; + } + auto devices = Platform.get_devices(); + std::string backend_type = get_device_backend_and_type(devices[0]); + for (const auto &device : devices) { + backend_devices[backend_type].push_back(device); + } + } + + std::vector keys; + for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) { + keys.push_back(it->first); + } + std::sort(keys.begin(), keys.end(), compare_backend); + + for (auto &key : keys) { + std::vector devs = backend_devices[key]; + std::sort(devs.begin(), devs.end(), compare_dev); + for (const auto &dev : devs) { + sycl_all_devs.push_back(dev); + } + } + + for (auto &dev : sycl_all_devs) + { + if (dev == default_device) + { + continue; + } + _devs.push_back(std::make_shared(dev)); + if (_cpu_device == -1 && dev.is_cpu()) + { + _cpu_device = _devs.size() - 1; + } + } + } + void check_id(unsigned int id) const + { + if (id >= _devs.size()) + { + throw std::runtime_error("invalid device id"); + } + } + std::vector> _devs; + /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current + /// thread id in _thread2dev_map, which means default device should be used + /// for the current thread. + const unsigned int DEFAULT_DEVICE_ID = 0; + /// thread-id to device-id map. + std::map _thread2dev_map; + int _cpu_device = -1; + }; + + static inline sycl::queue &get_default_queue() + { + return dev_mgr::instance().current_device().default_queue(); + } + + namespace detail + { + enum class pointer_access_attribute + { + host_only = 0, + device_only, + host_device, + end + }; + + static pointer_access_attribute get_pointer_attribute(sycl::queue &q, + const void *ptr) + { + switch (sycl::get_pointer_type(ptr, q.get_context())) + { + case sycl::usm::alloc::unknown: + return pointer_access_attribute::host_only; + case sycl::usm::alloc::device: + return pointer_access_attribute::device_only; + case sycl::usm::alloc::shared: + case sycl::usm::alloc::host: + return pointer_access_attribute::host_device; + } + } + + template + inline constexpr std::uint64_t get_type_combination_id(ArgT Val) + { + static_assert((unsigned char)library_data_t::library_data_t_size <= + std::numeric_limits::max() && + "library_data_t size exceeds limit."); + static_assert(std::is_same_v, "Unsupported ArgT"); + return (std::uint64_t)Val; + } + + template + inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal, + RestT... RestVal) + { + static_assert((std::uint8_t)library_data_t::library_data_t_size <= + std::numeric_limits::max() && + "library_data_t size exceeds limit."); + static_assert(sizeof...(RestT) <= 8 && "Too many parameters"); + static_assert(std::is_same_v, "Unsupported FirstT"); + return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal); + } + + class mem_mgr + { + mem_mgr() + { + // Reserved address space, no real memory allocation happens here. +#if defined(__linux__) + mapped_address_space = + (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +#elif defined(_WIN64) + mapped_address_space = (byte_t *)VirtualAlloc( + NULL, // NULL specified as the base address parameter + mapped_region_size, // Size of allocation + MEM_RESERVE, // Allocate reserved pages + PAGE_NOACCESS); // Protection = no access +#else +#error "Only support Windows and Linux." +#endif + next_free = mapped_address_space; + } + + public: + using buffer_id_t = int; + + struct allocation + { + buffer_t buffer; + byte_t *alloc_ptr; + size_t size; + }; + + ~mem_mgr() + { +#if defined(__linux__) + munmap(mapped_address_space, mapped_region_size); +#elif defined(_WIN64) + VirtualFree(mapped_address_space, 0, MEM_RELEASE); +#else +#error "Only support Windows and Linux." +#endif + } + + mem_mgr(const mem_mgr &) = delete; + mem_mgr &operator=(const mem_mgr &) = delete; + mem_mgr(mem_mgr &&) = delete; + mem_mgr &operator=(mem_mgr &&) = delete; + + /// Allocate + void *mem_alloc(size_t size) + { + if (!size) + return nullptr; + std::lock_guard lock(m_mutex); + if (next_free + size > mapped_address_space + mapped_region_size) + { + throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool"); + } + // Allocation + sycl::range<1> r(size); + buffer_t buf(r); + allocation A{buf, next_free, size}; + // Map allocation to device pointer + void *result = next_free; + m_map.emplace(next_free + size, A); + // Update pointer to the next free space. + next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1); + + return result; + } + + /// Deallocate + void mem_free(const void *ptr) + { + if (!ptr) + return; + std::lock_guard lock(m_mutex); + auto it = get_map_iterator(ptr); + m_map.erase(it); + } + + /// map: device pointer -> allocation(buffer, alloc_ptr, size) + allocation translate_ptr(const void *ptr) + { + std::lock_guard lock(m_mutex); + auto it = get_map_iterator(ptr); + return it->second; + } + + /// Check if the pointer represents device pointer or not. + bool is_device_ptr(const void *ptr) const + { + std::lock_guard lock(m_mutex); + return (mapped_address_space <= ptr) && + (ptr < mapped_address_space + mapped_region_size); + } + + /// Returns the instance of memory manager singleton. + static mem_mgr &instance() + { + static mem_mgr m; + return m; + } + + private: + std::map m_map; + mutable std::mutex m_mutex; + byte_t *mapped_address_space; + byte_t *next_free; + const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024; + const size_t alignment = 256; + /// This padding may be defined to some positive value to debug + /// out of bound accesses. + const size_t extra_padding = 0; + + std::map::iterator get_map_iterator(const void *ptr) + { + auto it = m_map.upper_bound(const_cast(reinterpret_cast(ptr))); + if (it == m_map.end()) + { + // Not a virtual pointer. + throw std::runtime_error("can not get buffer from non-virtual pointer"); + } + const allocation &alloc = it->second; + if (ptr < alloc.alloc_ptr) + { + // Out of bound. + // This may happen if there's a gap between allocations due to alignment + // or extra padding and pointer points to this gap. + throw std::runtime_error("invalid virtual pointer"); + } + return it; + } + }; + + template + class accessor; + template + class memory_traits + { + public: + static constexpr sycl::access::target target = + sycl::access::target::device; + static constexpr sycl::access_mode mode = + (Memory == constant) ? sycl::access_mode::read + : sycl::access_mode::read_write; + static constexpr size_t type_size = sizeof(T); + using element_t = + typename std::conditional::type; + using value_t = typename std::remove_cv::type; + template + using accessor_t = typename std::conditional< + Memory == local, sycl::local_accessor, + sycl::accessor>::type; + using pointer_t = T *; + }; + + static inline void *dpct_malloc(size_t size, sycl::queue &q) + { + return sycl::malloc_device(size, q.get_device(), q.get_context()); + } + +#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F)) + static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z, + sycl::queue &q) + { + pitch = PITCH_DEFAULT_ALIGN(x); + return dpct_malloc(pitch * y * z, q); + } + + /** + * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q. + * @tparam valueT The type of the element to be set. + * @param [in] q The queue in which the operation is done. + * @param [in] dev_ptr Pointer to the virtual device memory address. + * @param [in] value The value to be set. + * @param [in] size Number of elements to be set to the value. + * @return An event representing the memset operation. + */ + template + static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr, + valueT value, size_t size) + { + return q.fill(dev_ptr, value, size); + } + + /** + * @brief Sets \p value to the 3D memory region pointed by \p data in \p q. + * @tparam valueT The type of the element to be set. + * @param [in] q The queue in which the operation is done. + * @param [in] data Pointer to the pitched device memory region. + * @param [in] value The value to be set. + * @param [in] size 3D memory region by number of elements. + * @return An event list representing the memset operations. + */ + template + static inline std::vector + dpct_memset(sycl::queue &q, pitched_data data, valueT value, + sycl::range<3> size) + { + std::vector event_list; + size_t slice = data.get_pitch() * data.get_y(); + unsigned char *data_surface = (unsigned char *)data.get_data_ptr(); + for (size_t z = 0; z < size.get(2); ++z) + { + unsigned char *data_ptr = data_surface; + for (size_t y = 0; y < size.get(1); ++y) + { + event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0))); + data_ptr += data.get_pitch(); + } + data_surface += slice; + } + return event_list; + } + + /** + * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q. + * @tparam valueT The type of the element to be set. + * @param [in] q The queue in which the operation is done. + * @param [in] ptr Pointer to the virtual device memory. + * @param [in] pitch The pitch size by number of elements, including padding. + * @param [in] val The value to be set. + * @param [in] x The width of memory region by number of elements. + * @param [in] y The height of memory region by number of elements. + * @return An event list representing the memset operations. + */ + template + static inline std::vector + dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x, + size_t y) + { + return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val, + sycl::range<3>(x, y, 1)); + } + + static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr, + const void *from_ptr, + memcpy_direction dir) + { + switch (dir) + { + case memcpy_direction::host_to_host: + case memcpy_direction::host_to_device: + case memcpy_direction::device_to_host: + case memcpy_direction::device_to_device: + return dir; + case memcpy_direction::automatic: + { + // table[to_attribute][from_attribute] + static const memcpy_direction + direction_table[static_cast(pointer_access_attribute::end)] + [static_cast(pointer_access_attribute::end)] = + {{memcpy_direction::host_to_host, + memcpy_direction::device_to_host, + memcpy_direction::host_to_host}, + {memcpy_direction::host_to_device, + memcpy_direction::device_to_device, + memcpy_direction::device_to_device}, + {memcpy_direction::host_to_host, + memcpy_direction::device_to_device, + memcpy_direction::device_to_device}}; + return direction_table[static_cast(get_pointer_attribute( + q, to_ptr))][static_cast(get_pointer_attribute(q, from_ptr))]; + } + default: + throw std::runtime_error("dpct_memcpy: invalid direction value"); + } + } + + static sycl::event + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, + memcpy_direction direction, + const std::vector &dep_events = {}) + { + if (!size) + return sycl::event{}; + return q.memcpy(to_ptr, from_ptr, size, dep_events); + GGML_UNUSED(direction); + } + + // Get actual copy range and make sure it will not exceed range. + static inline size_t get_copy_range(sycl::range<3> size, size_t slice, + size_t pitch) + { + return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); + } + + static inline size_t get_offset(sycl::id<3> id, size_t slice, + size_t pitch) + { + return slice * id.get(2) + pitch * id.get(1) + id.get(0); + } + + /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr + /// and \p from_range to another specified by \p to_ptr and \p to_range. + static inline std::vector + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + sycl::range<3> to_range, sycl::range<3> from_range, + sycl::id<3> to_id, sycl::id<3> from_id, + sycl::range<3> size, memcpy_direction direction, + const std::vector &dep_events = {}) + { + // RAII for host pointer + class host_buffer + { + void *_buf; + size_t _size; + sycl::queue &_q; + const std::vector &_deps; // free operation depends + + public: + host_buffer(size_t size, sycl::queue &q, + const std::vector &deps) + : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} + void *get_ptr() const { return _buf; } + size_t get_size() const { return _size; } + ~host_buffer() + { + if (_buf) + { + _q.submit([&](sycl::handler &cgh) + { + cgh.depends_on(_deps); + cgh.host_task([buf = _buf] { std::free(buf); }); }); + } + } + }; + std::vector event_list; + + size_t to_slice = to_range.get(1) * to_range.get(0), + from_slice = from_range.get(1) * from_range.get(0); + unsigned char *to_surface = + (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); + const unsigned char *from_surface = + (const unsigned char *)from_ptr + + get_offset(from_id, from_slice, from_range.get(0)); + + if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) + { + return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), + direction, dep_events)}; + } + direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); + size_t size_slice = size.get(1) * size.get(0); + switch (direction) + { + case host_to_host: + for (size_t z = 0; z < size.get(2); ++z) + { + unsigned char *to_ptr = to_surface; + const unsigned char *from_ptr = from_surface; + if (to_range.get(0) == from_range.get(0) && + to_range.get(0) == size.get(0)) + { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, + direction, dep_events)); + } + else + { + for (size_t y = 0; y < size.get(1); ++y) + { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), + direction, dep_events)); + to_ptr += to_range.get(0); + from_ptr += from_range.get(0); + } + } + to_surface += to_slice; + from_surface += from_slice; + } + break; + case host_to_device: + { + host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, + event_list); + std::vector host_events; + if (to_slice == size_slice) + { + // Copy host data to a temp host buffer with the shape of target. + host_events = + dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, + host_to_host, dep_events); + } + else + { + // Copy host data to a temp host buffer with the shape of target. + host_events = dpct_memcpy( + q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, + // If has padding data, not sure whether it is useless. So fill temp + // buffer with it. + std::vector{ + dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), + device_to_host, dep_events)}); + } + // Copy from temp host buffer to device with only one submit. + event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), + buf.get_size(), host_to_device, + host_events)); + break; + } + case device_to_host: + { + host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, + event_list); + // Copy from host temp buffer to host target with reshaping. + event_list = dpct_memcpy( + q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), + sycl::id<3>(0, 0, 0), size, host_to_host, + // Copy from device to temp host buffer with only one submit. + std::vector{dpct_memcpy(q, buf.get_ptr(), from_surface, + buf.get_size(), + device_to_host, dep_events)}); + break; + } + case device_to_device: + event_list.push_back(q.submit([&](sycl::handler &cgh){ + cgh.depends_on(dep_events); + cgh.parallel_for( + size, + [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); })); + break; + default: + throw std::runtime_error("dpct_memcpy: invalid direction value"); + } + return event_list; + } + + /// memcpy 2D/3D matrix specified by pitched_data. + static inline std::vector + dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, + pitched_data from, sycl::id<3> from_id, sycl::range<3> size, + memcpy_direction direction = automatic) + { + return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), + sycl::range<3>(to.get_pitch(), to.get_y(), 1), + sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, + size, direction); + } + + /// memcpy 2D matrix with pitch. + static inline std::vector + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + size_t to_pitch, size_t from_pitch, size_t x, size_t y, + memcpy_direction direction = automatic) + { + return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), + sycl::range<3>(from_pitch, y, 1), + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), + sycl::range<3>(x, y, 1), direction); + } + + namespace deprecated + { + + template + class usm_allocator + { + private: + using Alloc = sycl::usm_allocator; + Alloc _impl; + + public: + using value_type = typename std::allocator_traits::value_type; + using pointer = typename std::allocator_traits::pointer; + using const_pointer = typename std::allocator_traits::const_pointer; + using void_pointer = typename std::allocator_traits::void_pointer; + using const_void_pointer = + typename std::allocator_traits::const_void_pointer; + using reference = typename std::allocator_traits::value_type &; + using const_reference = + const typename std::allocator_traits::value_type &; + using difference_type = + typename std::allocator_traits::difference_type; + using size_type = typename std::allocator_traits::size_type; + using propagate_on_container_copy_assignment = typename std::allocator_traits< + Alloc>::propagate_on_container_copy_assignment; + using propagate_on_container_move_assignment = typename std::allocator_traits< + Alloc>::propagate_on_container_move_assignment; + using propagate_on_container_swap = + typename std::allocator_traits::propagate_on_container_swap; + using is_always_equal = + typename std::allocator_traits::is_always_equal; + + template + struct rebind + { + typedef usm_allocator other; + }; + + usm_allocator() : _impl(dpct::get_default_queue()) {} + ~usm_allocator() {} + usm_allocator(const usm_allocator &other) : _impl(other._impl) {} + usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {} + pointer address(reference r) { return &r; } + const_pointer address(const_reference r) { return &r; } + pointer allocate(size_type cnt, const_void_pointer hint = nullptr) + { + return std::allocator_traits::allocate(_impl, cnt, hint); + } + void deallocate(pointer p, size_type cnt) + { + std::allocator_traits::deallocate(_impl, p, cnt); + } + size_type max_size() const + { + return std::allocator_traits::max_size(_impl); + } + bool operator==(const usm_allocator &other) const { return _impl == other._impl; } + bool operator!=(const usm_allocator &other) const { return _impl != other._impl; } + }; + + } // namespace deprecated + + inline void dpct_free(void *ptr, + const sycl::queue &q) + { + if (ptr) + { + sycl::free(ptr, q.get_context()); + } + } + + template + inline auto get_memory(const void *x) + { + T *new_x = reinterpret_cast(const_cast(x)); + return new_x; + } + + template + inline typename DataType::T2 get_value(const T *s, sycl::queue &q) + { + using Ty = typename DataType::T2; + Ty s_h; + if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only) + detail::dpct_memcpy(q, (void *)&s_h, (const void *)s, sizeof(T), device_to_host) + .wait(); + else + s_h = *reinterpret_cast(s); + return s_h; + } + + } // namespace detail + + template + inline auto get_value(const T *s, sycl::queue &q) + { + return detail::get_value(s, q); + } + + namespace detail + { + template + inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, + int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb, + const void * beta, void * c, int ldc) { + Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); + auto data_a = get_memory(a); + auto data_b = get_memory(b); + auto data_c = get_memory(c); + oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a, + lda, data_b, ldb, beta_value, data_c, ldc); + } + + template + class vectorized_binary + { + public: + inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) + { + VecT v4; + for (size_t i = 0; i < v4.size(); ++i) + { + v4[i] = binary_op(a[i], b[i]); + } + return v4; + } + }; + + template + class vectorized_binary< + VecT, BinaryOperation, + std::void_t>> + { + public: + inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) + { + return binary_op(a, b).template as(); + } + }; + + template + inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, + int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b, + int ldb, const void * beta, void ** c, int ldc, int batch_size, + matrix_info_t * matrix_info) { + Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); + + matrix_info->transpose_info[0] = a_trans; + matrix_info->transpose_info[1] = b_trans; + matrix_info->value_info[0] = alpha_value; + matrix_info->value_info[1] = beta_value; + matrix_info->size_info[0] = m; + matrix_info->size_info[1] = n; + matrix_info->size_info[2] = k; + matrix_info->ld_info[0] = lda; + matrix_info->ld_info[1] = ldb; + matrix_info->ld_info[2] = ldc; + matrix_info->groupsize_info = batch_size; + + sycl::event e = oneapi::math::blas::column_major::gemm_batch( + get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1, + matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2, + reinterpret_cast(matrix_info->value_info), reinterpret_cast(a), matrix_info->ld_info, + reinterpret_cast(b), matrix_info->ld_info + 1, + reinterpret_cast(matrix_info->value_info + 1), reinterpret_cast(c), + matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); + } + + template + inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, + int m, int n, int k, const void * alpha, const void * a, int lda, + long long int stride_a, const void * b, int ldb, long long int stride_b, + const void * beta, void * c, int ldc, long long int stride_c, int batch_size) { + Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); + auto data_a = get_memory(a); + auto data_b = get_memory(b); + auto data_c = get_memory(c); + oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, + data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, + data_c, ldc, stride_c, batch_size); + } + + } // namespace detail + + template + inline unsigned vectorized_binary(unsigned a, unsigned b, + const BinaryOperation binary_op) + { + sycl::vec v0{a}, v1{b}; + auto v2 = v0.as(); + auto v3 = v1.as(); + auto v4 = + detail::vectorized_binary()(v2, v3, binary_op); + v0 = v4.template as>(); + return v0; + } + + static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size, + memcpy_direction direction = automatic, + sycl::queue &q = dpct::get_default_queue()) + { + detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction); + } + + static inline unsigned int select_device(unsigned int id) + { + dev_mgr::instance().select_device(id); + return id; + } + + template + T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask, + unsigned int logical_sub_group_size = 32) + { + unsigned int id = g.get_local_linear_id(); + unsigned int start_index = + id / logical_sub_group_size * logical_sub_group_size; + unsigned int target_offset = (id % logical_sub_group_size) ^ mask; + return sycl::select_from_group(g, x, + target_offset < logical_sub_group_size + ? start_index + target_offset + : id); + } + + template + inline auto dp4a(T1 a, T2 b, T3 c) + { + return syclcompat::dp4a(a, b, c); + } + + struct sub_sat + { + template + auto operator()(const T x, const T y) const + { + return sycl::sub_sat(x, y); + } + }; + + template + inline T vectorized_min(T a, T b) + { + sycl::vec v0{a}, v1{b}; + auto v2 = v0.template as(); + auto v3 = v1.template as(); + auto v4 = sycl::min(v2, v3); + v0 = v4.template as>(); + return v0; + } + + inline float pow(const float a, const int b) { return sycl::pown(a, b); } + inline double pow(const double a, const int b) { return sycl::pown(a, b); } + inline float pow(const float a, const float b) { return sycl::pow(a, b); } + inline double pow(const double a, const double b) { return sycl::pow(a, b); } + template + inline typename std::enable_if_t, T> + pow(const T a, const U b) + { + return sycl::pow(a, static_cast(b)); + } + template + inline typename std::enable_if_t, double> + pow(const T a, const U b) + { + return sycl::pow(static_cast(a), static_cast(b)); + } + + inline double min(const double a, const float b) + { + return sycl::fmin(a, static_cast(b)); + } + inline double min(const float a, const double b) + { + return sycl::fmin(static_cast(a), b); + } + inline float min(const float a, const float b) { return sycl::fmin(a, b); } + inline double min(const double a, const double b) { return sycl::fmin(a, b); } + inline std::uint32_t min(const std::uint32_t a, const std::int32_t b) + { + return sycl::min(a, static_cast(b)); + } + inline std::uint32_t min(const std::int32_t a, const std::uint32_t b) + { + return sycl::min(static_cast(a), b); + } + inline std::int32_t min(const std::int32_t a, const std::int32_t b) + { + return sycl::min(a, b); + } + inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b) + { + return sycl::min(a, b); + } + inline std::uint64_t min(const std::uint64_t a, const std::int64_t b) + { + return sycl::min(a, static_cast(b)); + } + inline std::uint64_t min(const std::int64_t a, const std::uint64_t b) + { + return sycl::min(static_cast(a), b); + } + inline std::int64_t min(const std::int64_t a, const std::int64_t b) + { + return sycl::min(a, b); + } + inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b) + { + return sycl::min(a, b); + } + inline std::uint64_t min(const std::uint64_t a, const std::int32_t b) + { + return sycl::min(a, static_cast(b)); + } + inline std::uint64_t min(const std::int32_t a, const std::uint64_t b) + { + return sycl::min(static_cast(a), b); + } + inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b) + { + return sycl::min(a, static_cast(b)); + } + inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b) + { + return sycl::min(static_cast(a), b); + } + // max function overloads. + // For floating-point types, `float` or `double` arguments are acceptable. + // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or + // `std::int64_t` type arguments are acceptable. + inline double max(const double a, const float b) + { + return sycl::fmax(a, static_cast(b)); + } + inline double max(const float a, const double b) + { + return sycl::fmax(static_cast(a), b); + } + inline float max(const float a, const float b) { return sycl::fmax(a, b); } + inline double max(const double a, const double b) { return sycl::fmax(a, b); } + inline std::uint32_t max(const std::uint32_t a, const std::int32_t b) + { + return sycl::max(a, static_cast(b)); + } + inline std::uint32_t max(const std::int32_t a, const std::uint32_t b) + { + return sycl::max(static_cast(a), b); + } + inline std::int32_t max(const std::int32_t a, const std::int32_t b) + { + return sycl::max(a, b); + } + inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b) + { + return sycl::max(a, b); + } + inline std::uint64_t max(const std::uint64_t a, const std::int64_t b) + { + return sycl::max(a, static_cast(b)); + } + inline std::uint64_t max(const std::int64_t a, const std::uint64_t b) + { + return sycl::max(static_cast(a), b); + } + inline std::int64_t max(const std::int64_t a, const std::int64_t b) + { + return sycl::max(a, b); + } + inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b) + { + return sycl::max(a, b); + } + inline std::uint64_t max(const std::uint64_t a, const std::int32_t b) + { + return sycl::max(a, static_cast(b)); + } + inline std::uint64_t max(const std::int32_t a, const std::uint64_t b) + { + return sycl::max(static_cast(a), b); + } + inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b) + { + return sycl::max(a, static_cast(b)); + } + inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b) + { + return sycl::max(static_cast(a), b); + } + + inline void + has_capability_or_fail(const sycl::device &dev, + const std::initializer_list &props) + { + for (const auto &it : props) + { + if (dev.has(it)) + continue; + switch (it) + { + case sycl::aspect::fp64: + throw std::runtime_error("'double' is not supported in '" + + dev.get_info() + + "' device"); + break; + case sycl::aspect::fp16: + throw std::runtime_error("'half' is not supported in '" + + dev.get_info() + + "' device"); + break; + default: +#define __SYCL_ASPECT(ASPECT, ID) \ + case sycl::aspect::ASPECT: \ + return #ASPECT; +#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) +#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) + auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string + { + switch (AspectNum) + { +#include +#include + default: + return "unknown aspect"; + } + }; +#undef __SYCL_ASPECT_DEPRECATED_ALIAS +#undef __SYCL_ASPECT_DEPRECATED +#undef __SYCL_ASPECT + throw std::runtime_error( + "'" + getAspectNameStr(it) + "' is not supported in '" + + dev.get_info() + "' device"); + } + break; + } + } + + static inline unsigned int get_current_device_id() + { + return dev_mgr::instance().current_device_id(); + } + + static inline device_ext &get_current_device() + { + return dev_mgr::instance().current_device(); + } + + static inline device_ext &get_device(unsigned int id) + { + return dev_mgr::instance().get_device(id); + } + + static inline sycl::queue &get_in_order_queue() + { + return dev_mgr::instance().current_device().in_order_queue(); + } + + static sycl::event + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size, + memcpy_direction direction, + const std::vector &dep_events = {}) + { + if (!size) + return sycl::event{}; + return q.memcpy(to_ptr, from_ptr, size, dep_events); + GGML_UNUSED(direction); + } + + // Get actual copy range and make sure it will not exceed range. + static inline size_t get_copy_range(sycl::range<3> size, size_t slice, + size_t pitch) + { + return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0); + } + + static inline size_t get_offset(sycl::id<3> id, size_t slice, + size_t pitch) + { + return slice * id.get(2) + pitch * id.get(1) + id.get(0); + } + + /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr + /// and \p from_range to another specified by \p to_ptr and \p to_range. + static inline std::vector + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + sycl::range<3> to_range, sycl::range<3> from_range, + sycl::id<3> to_id, sycl::id<3> from_id, + sycl::range<3> size, memcpy_direction direction, + const std::vector &dep_events = {}) + { + // RAII for host pointer + class host_buffer + { + void *_buf; + size_t _size; + sycl::queue &_q; + const std::vector &_deps; // free operation depends + + public: + host_buffer(size_t size, sycl::queue &q, + const std::vector &deps) + : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {} + void *get_ptr() const { return _buf; } + size_t get_size() const { return _size; } + ~host_buffer() + { + if (_buf) + { + _q.submit([&](sycl::handler &cgh) + { + cgh.depends_on(_deps); + cgh.host_task([buf = _buf] { std::free(buf); }); }); + } + } + }; + std::vector event_list; + + size_t to_slice = to_range.get(1) * to_range.get(0), + from_slice = from_range.get(1) * from_range.get(0); + unsigned char *to_surface = + (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0)); + const unsigned char *from_surface = + (const unsigned char *)from_ptr + + get_offset(from_id, from_slice, from_range.get(0)); + + if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) + { + return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2), + direction, dep_events)}; + } + direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction); + size_t size_slice = size.get(1) * size.get(0); + switch (direction) + { + case host_to_host: + for (size_t z = 0; z < size.get(2); ++z) + { + unsigned char *to_ptr = to_surface; + const unsigned char *from_ptr = from_surface; + if (to_range.get(0) == from_range.get(0) && + to_range.get(0) == size.get(0)) + { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice, + direction, dep_events)); + } + else + { + for (size_t y = 0; y < size.get(1); ++y) + { + event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0), + direction, dep_events)); + to_ptr += to_range.get(0); + from_ptr += from_range.get(0); + } + } + to_surface += to_slice; + from_surface += from_slice; + } + break; + case host_to_device: + { + host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q, + event_list); + std::vector host_events; + if (to_slice == size_slice) + { + // Copy host data to a temp host buffer with the shape of target. + host_events = + dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, + host_to_host, dep_events); + } + else + { + // Copy host data to a temp host buffer with the shape of target. + host_events = dpct_memcpy( + q, buf.get_ptr(), from_surface, to_range, from_range, + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host, + // If has padding data, not sure whether it is useless. So fill temp + // buffer with it. + std::vector{ + dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(), + device_to_host, dep_events)}); + } + // Copy from temp host buffer to device with only one submit. + event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(), + buf.get_size(), host_to_device, + host_events)); + break; + } + case device_to_host: + { + host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q, + event_list); + // Copy from host temp buffer to host target with reshaping. + event_list = dpct_memcpy( + q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0), + sycl::id<3>(0, 0, 0), size, host_to_host, + // Copy from device to temp host buffer with only one submit. + std::vector{dpct_memcpy(q, buf.get_ptr(), from_surface, + buf.get_size(), + device_to_host, dep_events)}); + break; + } + case device_to_device: + event_list.push_back(q.submit([&](sycl::handler &cgh) + { + cgh.depends_on(dep_events); + cgh.parallel_for( + size, + [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); })); + break; + default: + throw std::runtime_error("dpct_memcpy: invalid direction value"); + } + return event_list; + } + + /// memcpy 2D/3D matrix specified by pitched_data. + static inline std::vector + dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id, + pitched_data from, sycl::id<3> from_id, sycl::range<3> size, + memcpy_direction direction = automatic) + { + return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(), + sycl::range<3>(to.get_pitch(), to.get_y(), 1), + sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id, + size, direction); + } + + /// memcpy 2D matrix with pitch. + static inline std::vector + dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, + size_t to_pitch, size_t from_pitch, size_t x, size_t y, + memcpy_direction direction = automatic) + { + return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1), + sycl::range<3>(from_pitch, y, 1), + sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), + sycl::range<3>(x, y, 1), direction); + } + + inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n, + int k, const void * alpha, const void * a, library_data_t a_type, int lda, const void * b, + library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc, + library_data_t scaling_type) { + if (scaling_type == library_data_t::real_float && + c_type == library_data_t::complex_float) + { + scaling_type = library_data_t::complex_float; + } + else if (scaling_type == library_data_t::real_double && + c_type == library_data_t::complex_double) + { + scaling_type = library_data_t::complex_double; + } + + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) + { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): + { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_float, library_data_t::complex_float, + library_data_t::complex_float, library_data_t::complex_float): + { + detail::gemm_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_double, library_data_t::complex_double, + library_data_t::complex_double, library_data_t::complex_double): + { + detail::gemm_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): + { + detail::gemm_impl(q, a_trans, b_trans, m, n, k, alpha, a, + lda, b, ldb, beta, c, ldc); + break; + } +#ifdef __INTEL_MKL__ + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): + { + float alpha_value = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_impl(q, a_trans, b_trans, m, n, k, &alpha_half, + a, lda, b, ldb, &beta_half, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): + { + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): + { + float alpha_float = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_float = + dpct::get_value(reinterpret_cast(beta), q); + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc); + break; + } +#endif // __INTEL_MKL__ + default: + throw std::runtime_error("the combination of data type is unsupported"); + } + } // gemm() + + /// Computes a batch of matrix-matrix product with general matrices. + /// \param [in] q The queue where the routine should be executed. + /// \param [in] a_trans Specifies the operation applied to A. + /// \param [in] b_trans Specifies the operation applied to B. + /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. + /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. + /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). + /// \param [in] alpha Scaling factor for the matrix-matrix product. + /// \param [in] a Input matrix A. + /// \param [in] a_type Data type of the matrix A. + /// \param [in] lda Leading dimension of A. + /// \param [in] b Input matrix B. + /// \param [in] b_type Data type of the matrix B. + /// \param [in] ldb Leading dimension of B. + /// \param [in] beta Scaling factor for matrix C. + /// \param [in, out] c Input/Output matrix C. + /// \param [in] c_type Data type of the matrix C. + /// \param [in] ldc Leading dimension of C. + /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. + /// \param [in] scaling_type Data type of the scaling factors. + inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, + int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda, + const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[], + library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type, + matrix_info_t * matrix_info) { + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) + { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, batch_size, matrix_info); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): + { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, + beta, c, ldc, batch_size, matrix_info); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); + break; + } +#ifdef __INTEL_MKL__ + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); + break; + } +#endif + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): + { + float alpha_float = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_float = + dpct::get_value(reinterpret_cast(beta), q); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size, + matrix_info); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): + { + float alpha_value = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } + } + + /// Computes a batch of matrix-matrix product with general matrices. + /// \param [in] q The queue where the routine should be executed. + /// \param [in] a_trans Specifies the operation applied to A. + /// \param [in] b_trans Specifies the operation applied to B. + /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C. + /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C. + /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B). + /// \param [in] alpha Scaling factor for the matrix-matrix product. + /// \param [in] a Input matrix A. + /// \param [in] a_type Data type of the matrix A. + /// \param [in] lda Leading dimension of A. + /// \param [in] stride_a Stride between the different A matrices. + /// \param [in] b Input matrix B. + /// \param [in] b_type Data type of the matrix B. + /// \param [in] ldb Leading dimension of B. + /// \param [in] stride_b Stride between the different B matrices. + /// \param [in] beta Scaling factor for matrix C. + /// \param [in, out] c Input/Output matrix C. + /// \param [in] c_type Data type of the matrix C. + /// \param [in] ldc Leading dimension of C. + /// \param [in] stride_c Stride between the different C matrices. + /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. + /// \param [in] scaling_type Data type of the scaling factors. + inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, + int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda, + long long int stride_a, const void * b, library_data_t b_type, int ldb, + long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc, + long long int stride_c, int batch_size, library_data_t scaling_type) { + if (scaling_type == library_data_t::real_float && + c_type == library_data_t::complex_float) + { + scaling_type = library_data_t::complex_float; + } + else if (scaling_type == library_data_t::real_double && + c_type == library_data_t::complex_double) + { + scaling_type = library_data_t::complex_double; + } + + std::uint64_t key = + detail::get_type_combination_id(a_type, b_type, c_type, scaling_type); + switch (key) + { + case detail::get_type_combination_id( + library_data_t::real_float, library_data_t::real_float, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_double, library_data_t::real_double, + library_data_t::real_double, library_data_t::real_double): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_float, library_data_t::complex_float, + library_data_t::complex_float, library_data_t::complex_float): + { + detail::gemm_batch_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::complex_double, library_data_t::complex_double, + library_data_t::complex_double, library_data_t::complex_double): + { + detail::gemm_batch_impl, std::complex, + std::complex, std::complex>( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_half): + { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, + a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } +#ifdef __INTEL_MKL__ + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_bfloat16, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_bfloat16, library_data_t::real_bfloat16, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size); + break; + } +#endif + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_int32, library_data_t::real_int32): + { + detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, + a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_int8, library_data_t::real_int8, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_float, library_data_t::real_float): + { + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, + beta, c, ldc, stride_c, batch_size); + break; + } + case detail::get_type_combination_id( + library_data_t::real_half, library_data_t::real_half, + library_data_t::real_half, library_data_t::real_float): + { + float alpha_value = + dpct::get_value(reinterpret_cast(alpha), q); + float beta_value = + dpct::get_value(reinterpret_cast(beta), q); + sycl::half alpha_half(alpha_value); + sycl::half beta_half(beta_value); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b, + &beta_half, c, ldc, stride_c, batch_size); + break; + } + default: + throw std::runtime_error("the combination of data type is unsupported"); + } + } + + static inline void + async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr, + size_t from_pitch, size_t x, size_t y, + memcpy_direction direction = automatic, + sycl::queue &q = get_default_queue()) + { + detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y, + direction); + } + + using err0 = detail::generic_error_type; + using err1 = detail::generic_error_type; + + static inline void dpct_free(void *ptr, sycl::queue &q = get_default_queue()) { + detail::dpct_free(ptr, q); + } + + /// dpct accessor used as device function parameter. + template class accessor; + template class accessor { + public: + using memory_t = detail::memory_traits; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template accessor_t<3>; + accessor(pointer_t data, const sycl::range<3> &in_range) + : _data(data), _range(in_range) {} + template + accessor(typename std::enable_if::type &acc) + : accessor(acc, acc.get_range()) {} + accessor(const accessor_t &acc, const sycl::range<3> &in_range) + : accessor(acc.get_pointer(), in_range) {} + accessor operator[](size_t index) const { + sycl::range<2> sub(_range.get(1), _range.get(2)); + return accessor(_data + index * sub.size(), sub); + } + + pointer_t get_ptr() const { return _data; } + + private: + pointer_t _data; + sycl::range<3> _range; + }; + template class accessor { + public: + using memory_t = detail::memory_traits; + using element_t = typename memory_t::element_t; + using pointer_t = typename memory_t::pointer_t; + using accessor_t = typename memory_t::template accessor_t<2>; + accessor(pointer_t data, const sycl::range<2> &in_range) + : _data(data), _range(in_range) {} + template + accessor(typename std::enable_if::type &acc) + : accessor(acc, acc.get_range()) {} + accessor(const accessor_t &acc, const sycl::range<2> &in_range) + : accessor(acc.get_pointer(), in_range) {} + + pointer_t operator[](size_t index) const { + return _data + _range.get(1) * index; + } + + pointer_t get_ptr() const { return _data; } + + private: + pointer_t _data; + sycl::range<2> _range; + }; + + namespace detail { + /// Device variable with address space of shared, global or constant. + template class device_memory { + public: + using accessor_t = + typename detail::memory_traits::template accessor_t; + using value_t = typename detail::memory_traits::value_t; + using dpct_accessor_t = dpct::accessor; + + device_memory() : device_memory(sycl::range(1)) {} + + /// Constructor of 1-D array with initializer list + device_memory(const sycl::range &in_range, + std::initializer_list &&init_list) + : device_memory(in_range) { + assert(init_list.size() <= in_range.size()); + _host_ptr = (value_t *)std::malloc(_size); + std::memset(_host_ptr, 0, _size); + std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T)); + } + + /// Constructor of 2-D array with initializer list + template + device_memory( + const typename std::enable_if>::type &in_range, + std::initializer_list> &&init_list) + : device_memory(in_range) { + assert(init_list.size() <= in_range[0]); + _host_ptr = (value_t *)std::malloc(_size); + std::memset(_host_ptr, 0, _size); + auto tmp_data = _host_ptr; + for (auto sub_list : init_list) { + assert(sub_list.size() <= in_range[1]); + std::memcpy(tmp_data, sub_list.begin(), + sub_list.size() * sizeof(T)); + tmp_data += in_range[1]; + } + } + + /// Constructor with range + device_memory(const sycl::range &range_in) + : _size(range_in.size() * sizeof(T)), _range(range_in), + _reference(false), _host_ptr(nullptr), _device_ptr(nullptr) { + static_assert( + (Memory == global) || (Memory == constant) || (Memory == shared), + "device memory region should be global, constant or shared"); + // Make sure that singleton class mem_mgr and dev_mgr will destruct + // later than this. + detail::mem_mgr::instance(); + dev_mgr::instance(); + } + + /// Constructor with range + template + device_memory(Args... Arguments) + : device_memory(sycl::range(Arguments...)) {} + + ~device_memory() { + if (_device_ptr && !_reference) + dpct::dpct_free(_device_ptr); + if (_host_ptr) + std::free(_host_ptr); + } + + /// Allocate memory with default queue, and init memory if has initial + /// value. + void init() { init(dpct::get_default_queue()); } + /// Allocate memory with specified queue, and init memory if has initial + /// value. + void init(sycl::queue &q) { + if (_device_ptr) + return; + if (!_size) + return; + allocate_device(q); + if (_host_ptr) + detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size, + host_to_device); + } + + /// The variable is assigned to a device pointer. + void assign(value_t *src, size_t size) { + this->~device_memory(); + new (this) device_memory(src, size); + } + + /// Get memory pointer of the memory object, which is virtual pointer when + /// usm is not used, and device pointer when usm is used. + value_t *get_ptr() { return get_ptr(get_default_queue()); } + /// Get memory pointer of the memory object, which is virtual pointer when + /// usm is not used, and device pointer when usm is used. + value_t *get_ptr(sycl::queue &q) { + init(q); + return _device_ptr; + } + + /// Get the device memory object size in bytes. + size_t get_size() { return _size; } + + template + typename std::enable_if::type &operator[](size_t index) { + init(); + return _device_ptr[index]; + } + + /// Get dpct::accessor with dimension info for the device memory object + /// when usm is used and dimension is greater than 1. + template + typename std::enable_if::type + get_access([[maybe_unused]] sycl::handler &cgh) { + return dpct_accessor_t((T *)_device_ptr, _range); + } + + private: + device_memory(value_t *memory_ptr, size_t size) + : _size(size), _range(size / sizeof(T)), _reference(true), + _device_ptr(memory_ptr) {} + + void allocate_device(sycl::queue &q) { + #ifndef DPCT_USM_LEVEL_NONE + if (Memory == shared) { + _device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(), + q.get_context()); + return; + } + #ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY + if (Memory == constant) { + _device_ptr = (value_t *)sycl::malloc_device( + _size, q.get_device(), q.get_context(), + sycl::ext::oneapi::property::usm::device_read_only()); + return; + } + #endif + #endif + _device_ptr = (value_t *)detail::dpct_malloc(_size, q); + } + + size_t _size; + sycl::range _range; + bool _reference; + value_t *_host_ptr; + value_t *_device_ptr; + }; + template + class device_memory : public device_memory { + public: + using base = device_memory; + using value_t = typename base::value_t; + using accessor_t = + typename detail::memory_traits::template accessor_t<0>; + + /// Constructor with initial value. + device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {} + + /// Default constructor + device_memory() : base(1) {} + }; + } // namespace detail + + template + using global_memory = detail::device_memory; + template + using constant_memory = detail::device_memory; + template + using shared_memory = detail::device_memory; + + + template + inline T atomic_fetch_add(T *addr, T operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_add(operand); + } + + template + inline T1 atomic_fetch_add(T1 *addr, T2 operand) { + auto atm = + sycl::atomic_ref(addr[0]); + return atm.fetch_add(operand); + } + + template + inline T atomic_fetch_add(T *addr, T operand, + sycl::memory_order memoryOrder) { + switch (memoryOrder) { + case sycl::memory_order::relaxed: + return atomic_fetch_add(addr, operand); + case sycl::memory_order::acq_rel: + return atomic_fetch_add(addr, operand); + case sycl::memory_order::seq_cst: + return atomic_fetch_add(addr, operand); + default: + assert(false && "Invalid memory_order for atomics. Valid memory_order for " + "atomics are: sycl::memory_order::relaxed, " + "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!"); + } + } + + template + inline T1 atomic_fetch_add(T1 *addr, T2 operand, + sycl::memory_order memoryOrder) { + atomic_fetch_add(addr, operand, memoryOrder); + } + +} // COPY from DPCT head files + +#endif // GGML_SYCL_DPCT_HELPER_HPP diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 810995d0cbf..572e32779e5 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -1,1244 +1,1244 @@ -#include "common.hpp" -#include "ggml-sycl/presets.hpp" -#include "ggml.h" -#include "element_wise.hpp" - -#define SYCL_GLOBAL_ID_LOOP(K, ITEM) \ - for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0)) - -#define SYCL_LOCAL_ID_CALC(ITEM, IDX) \ - (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX)) - - -static void acc_f32(const float * x, const float * y, float * dst, const int ne, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, int offset, const sycl::nd_item<1> &item_ct1) { - const int i = SYCL_LOCAL_ID_CALC(item_ct1, 0); - if (i >= ne) { - return; - } - int src1_idx = i - offset; - int oz = src1_idx / nb2; - int oy = (src1_idx - (oz * nb2)) / nb1; - int ox = src1_idx % nb1; - if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { - dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; - } else { - dst[i] = x[i]; - } -} - -/* Unary OP funcs */ -template -static __dpct_inline__ T op_sgn(T x) { - return x > static_cast(0.f) ? static_cast(1.f) : ((x < static_cast(0.f) ? static_cast(-1.f) : static_cast(0.f))); -} - -template -static __dpct_inline__ T op_abs(T x) { - return sycl::fabs(x); -} - -template -static __dpct_inline__ T op_elu(T x) { - return (x > static_cast(0.f)) ? x : sycl::expm1(x); -} - -template -static __dpct_inline__ T op_gelu(T x) { - const T GELU_COEF_A = static_cast(0.044715f); - const T SQRT_2_OVER_PI = static_cast(0.79788456080286535587989211986876f); - return static_cast(0.5f) * x * - (static_cast(1.0f) + - sycl::tanh(SQRT_2_OVER_PI * x * (static_cast(1.0f) + GELU_COEF_A * x * x))); -} - -template -static __dpct_inline__ T op_silu(T x) { - return x / (static_cast(1.0f) + sycl::native::exp(-x)); -} - -template -static __dpct_inline__ T op_gelu_quick(T x) { - const T GELU_QUICK_COEF_LOCAL = static_cast(-1.702f); - return x * (static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x))); -} - -template -static __dpct_inline__ T op_gelu_erf(T x) { - const T SQRT_2_INV = static_cast(0.70710678118654752440084436210484f); - return static_cast(0.5f) * x * (static_cast(1.0f) + sycl::erf(x * SQRT_2_INV)); -} - -template -static __dpct_inline__ T op_tanh(T x) { - return sycl::tanh(x); -} - -template -static __dpct_inline__ T op_relu(T x) { - return sycl::fmax(x, static_cast(0)); -} - -template -static __dpct_inline__ T op_sigmoid(T x) { - return static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(-x)); -} - -template -static __dpct_inline__ T op_sqrt(T x) { - return sycl::sqrt(x); -} - -template -static __dpct_inline__ T op_sin(T x) { - return sycl::sin(x); -} - -template -static __dpct_inline__ T op_cos(T x) { - return sycl::cos(x); -} - -template -static __dpct_inline__ T op_hardsigmoid(T x) { - return sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); -} - -template -static __dpct_inline__ T op_hardswish(T x) { - return x * sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); -} - -template -static __dpct_inline__ T op_exp(T x) { - return sycl::exp(x); -} - -template -static __dpct_inline__ T op_log(T x) { - if (x <= static_cast(0)) { - return neg_infinity(); - } - return sycl::log(x); -} - -template -static __dpct_inline__ T op_neg(T x) { - return -x; -} - -template -static __dpct_inline__ T op_step(T x) { - return (x > static_cast(0.0f)) ? static_cast(1.0f) : static_cast(0.0f); -} - -template -static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) { - T neg_slope_T = static_cast(negative_slope); - return sycl::fmax(x, static_cast(0)) + - sycl::fmin(x, static_cast(0.0f)) * neg_slope_T; -} - -template -static __dpct_inline__ T op_sqr(T x) { - return x * x; -} - -template -static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) { - return x < static_cast(min_val) ? static_cast(min_val) : (x > static_cast(max_val) ? static_cast(max_val) : x); -} - -template -static __dpct_inline__ T op_floor(T x) { - return sycl::floor(x); -} - -template -static __dpct_inline__ T op_ceil(T x) { - return sycl::ceil(x); -} - -template -static __dpct_inline__ T op_round(T x) { - return sycl::round(x); -} - -template -static __dpct_inline__ T op_trunc(T x) { - return sycl::trunc(x); -} - -template -static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_sgn(x[i]); - } -} - -template -static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_abs(x[i]); - } -} - -template -static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_elu(x[i]); - } -} - -template -static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_gelu(x[i]); - } -} - -template -static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_silu(x[i]); - } -} - -template -static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_gelu_quick(x[i]); - } -} - -template -static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_gelu_erf(x[i]); - } -} - -template -static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_tanh(x[i]); - } -} - -template -static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_relu(x[i]); - } -} - -template -static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_sigmoid(x[i]); - } -} - -template -static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_sqrt(x[i]); - } -} - -template -static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_sin(x[i]); - } -} - -template -static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_cos(x[i]); - } -} - -template -static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_hardsigmoid(x[i]); - } -} - -template -static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_hardswish(x[i]); - } -} - -template -static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_exp(x[i]); - } -} - -template -static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_log(x[i]); - } -} - -template -static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_neg(x[i]); - } -} - -template -static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_step(x[i]); - } -} - -template -static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_leaky_relu(x[i], negative_slope); - } -} - -template -static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_sqr(x[i]); - } -} - -template -static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_clamp(x[i], min_val, max_val); - } -} - -template -static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_floor(x[i]); - } -} - -template -static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_ceil(x[i]); - } -} - -template -static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_round(x[i]); - } -} - -template -static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_trunc(x[i]); - } -} - -template -static void upscale(const T *x, T *dst, const int nb00, const int nb01, - const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int ne13, const float sf0, const float sf1, - const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) { - int index = item_ct1.get_local_id(0) + - item_ct1.get_group(0) * item_ct1.get_local_range(0); - if (index >= ne10 * ne11 * ne12 * ne13) { - return; - } - // operation - int i10 = index % ne10; - int i11 = (index / ne10) % ne11; - int i12 = (index / (ne10 * ne11)) % ne12; - int i13 = (index / (ne10 * ne11 * ne12)) % ne13; - - int i00 = static_cast(i10 / sf0); - int i01 = static_cast(i11 / sf1); - int i02 = static_cast(i12 / sf2); - int i03 = static_cast(i13 / sf3); - - dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); -} - -template -static void clamp(const T * x, T * dst, const float min, const float max, const int k, - const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = x[i] < static_cast(min) ? static_cast(min) : (x[i] > static_cast(max) ? static_cast(max) : x[i]); - } -} - -template -static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - const int64_t j0 = (i / n) * o0 + (i % n); - const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); - dst[i] = op_gelu(x[j0]) * g[j1]; - } -} - -template -static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - const int64_t j0 = (i / n) * o0 + (i % n); - const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); - dst[i] = op_relu(x[j0]) * g[j1]; - } -} - -template -static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - const int64_t j0 = (i / n) * o0 + (i % n); - const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); - dst[i] = op_silu(x[j0]) * g[j1]; - } -} - -template -static void gated_op_fused_geglu_erf(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - const int64_t j0 = (i / n) * o0 + (i % n); - const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); - dst[i] = op_gelu_erf(x[j0]) * g[j1]; - } -} - -template -static void gated_op_fused_geglu_quick(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - const int64_t j0 = (i / n) * o0 + (i % n); - const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); - dst[i] = op_gelu_quick(x[j0]) * g[j1]; - } -} - -namespace ggml_sycl_detail { -static void acc_f32_sycl(const float *x, const float *y, float *dst, - const int n_elements, const int ne10, const int ne11, - const int ne12, const int nb1, const int nb2, - const int offset, queue_ptr stream) { - int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * - sycl::range<1>(SYCL_ACC_BLOCK_SIZE), - sycl::range<1>(SYCL_ACC_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, - item_ct1); - }); -} - -template -static void arange_kernel(T * dst, const int k, T start, T step, - const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = start + static_cast(i) * step; - } -} - -template -static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01, - const int nb02, const int nb03, const int ne10, const int ne11, - const int ne12, const int ne13, const float sf0, const float sf1, - const float sf2, const float sf3, queue_ptr stream) { - int dst_size = ne10 * ne11 * ne12 * ne13; - int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE); - sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { - upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1); - }); -} - -template -static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -template -static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;; - GGML_ASSERT(dst->ne[0] == nc); - GGML_ASSERT(ggml_is_contiguous_1(dst->src[0])); - GGML_ASSERT(ggml_is_contiguous(dst)); - const int32_t swapped = ((const int32_t *) dst->op_params)[1]; - void * src0_d = src0->data; - void * src1_d = src1 ? src1->data : src0->data; - const int64_t src0_o = src0->nb[1]; - const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; - void * dst_d = dst->data; - if (src1) { - GGML_ASSERT(ggml_is_contiguous_1(src1)); - GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); - GGML_ASSERT(src1->ne[0] == nc); - GGML_ASSERT(src0->type == src1->type); - } - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - sycl::half * src0_p = (sycl::half *) src0_d; - sycl::half * src1_p = (sycl::half *) src1_d; - - if (!src1) { - src0_p += swapped ? nc : 0; - src1_p += swapped ? 0 : nc; - } - kernel_invoker(src0_p, - src1_p, - (sycl::half *) dst_d, - ggml_nelements(dst), - nc, - src0_o / sizeof(sycl::half), - src1_o / sizeof(sycl::half), - main_stream, - std::forward(args)...); - break; - } -#endif - case GGML_TYPE_F32: - { - float * src0_p = (float *) src0_d; - float * src1_p = (float *) src1_d; - - if (!src1) { - src0_p += swapped ? nc : 0; - src1_p += swapped ? 0 : nc; - } - - kernel_invoker(src0_p, - src1_p, - (float *) dst_d, - ggml_nelements(dst), - nc, - src0_o / sizeof(float), - src1_o / sizeof(float), - main_stream, - std::forward(args)...); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - -template -static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { -#if defined (GGML_SYCL_F16) - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); -#else - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); -#endif - GGML_ASSERT(dst->src[0]->type == dst->type); - - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - - const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0]; - const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1]; - const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2]; - const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3]; - switch (dst->type) { -#if defined (GGML_SYCL_F16) - case GGML_TYPE_F16: - { - auto data_pts = cast_data(dst); - kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], - (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, - main_stream, std::forward(args)...); - break; - } -#endif - case GGML_TYPE_F32: - { - auto data_pts = cast_data(dst); - kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], - (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, - main_stream, std::forward(args)...); - break; - } - default: - GGML_ABORT("GGML tensor type not supported!\n"); - } -} - - -static inline void ggml_sycl_op_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->type == GGML_TYPE_F32); - float start, stop, step; - memcpy(&start, dst->op_params, sizeof(float)); - memcpy(&stop, (float *) dst->op_params + 1, sizeof(float)); - memcpy(&step, (float *) dst->op_params + 2, sizeof(float)); - dpct::queue_ptr stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - float * dst_ptr = (float *)dst->data; - const int k = (int)ggml_nelements(dst); - const int num_blocks = ceil_div(k, SYCL_ARANGE_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE), - sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - arange_kernel(dst_ptr, k, start, step, item_ct1); - }); -} - -} // namespace ggml_sycl_detail - - - -static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE), - sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), - sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), - sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), - sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE), - sycl::range<1>(SYCL_TANH_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), - sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE), - sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE), - sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), - sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), - sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE), - sycl::range<1>(SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE), - sycl::range<1>(SYCL_NEG_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE), - sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE), - sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), - sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), - sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - float negative_slope; - memcpy(&negative_slope, dst->op_params, sizeof(float)); - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) { - const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), - sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1); - }); - }, negative_slope); -} - -static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE), - sycl::range<1>(SYCL_SQR_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst, - [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03, - int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3, - queue_ptr stream) { - ggml_sycl_detail::upscale_sycl(src, dst_ptr, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, stream); - }); -} - -static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - float min_val; - float max_val; - memcpy(&min_val, dst->op_params, sizeof(float)); - memcpy(&max_val, (float *) dst->op_params + 1, sizeof(float)); - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) { - const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE), - sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - clamp(src, dst_ptr, min_arg, max_arg, k_elements, item_ct1); - }); - }, min_val, max_val); -} - -static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_ceil_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - const float * src1_dd = static_cast(dst->src[1]->data); - float * dst_dd = static_cast(dst->data); - - int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 - int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 - // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused - int offset = dst->op_params[3] / 4; // offset in bytes - - ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream); -} - -static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, - [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { - const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); - main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { - gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, - [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { - const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu - main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { - gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, - [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { - const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu - main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { - gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, - [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { - const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); - main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { - gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); - }); - }); -} - -static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, - [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { - const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); - main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { - gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); - }); - }); -} - - -void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_sqrt(ctx, dst); -} - -void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_sin(ctx, dst); -} - -void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_cos(ctx, dst); -} - -void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); - ggml_sycl_op_acc(ctx, dst); -} - -void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_gelu(ctx, dst); -} - -void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_silu(ctx, dst); -} - -void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_gelu_quick(ctx, dst); -} - -void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_gelu_erf(ctx, dst); -} - -void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_tanh(ctx, dst); -} - -void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_relu(ctx, dst); -} - -void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_sigmoid(ctx, dst); -} - -void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_hardsigmoid(ctx, dst); -} - -void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_hardswish(ctx, dst); -} - -void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_exp(ctx, dst); -} - -void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_log(ctx, dst); -} - -void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_neg(ctx, dst); -} - -void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_step(ctx, dst); -} - -void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_leaky_relu(ctx, dst); -} - -void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_sqr(ctx, dst); -} - -void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_upscale(ctx, dst); -} - - -void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_clamp(ctx, dst); -} - -void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_sgn(ctx, dst); -} - -void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_abs(ctx, dst); -} - -void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_elu(ctx, dst); -} - -void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_geglu(ctx, dst); -} - -void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_reglu(ctx, dst); -} - -void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_swiglu(ctx, dst); -} - -void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_geglu_erf(ctx, dst); -} - -void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_geglu_quick(ctx, dst); -} - -void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/0); - ggml_sycl_detail::ggml_sycl_op_arange(ctx, dst); -} - -void ggml_sycl_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_floor(ctx, dst); -} - -void ggml_sycl_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_ceil(ctx, dst); -} - -void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_round(ctx, dst); -} - -void ggml_sycl_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); - ggml_sycl_op_trunc(ctx, dst); -} +#include "common.hpp" +#include "ggml-sycl/presets.hpp" +#include "ggml.h" +#include "element_wise.hpp" + +#define SYCL_GLOBAL_ID_LOOP(K, ITEM) \ + for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0)) + +#define SYCL_LOCAL_ID_CALC(ITEM, IDX) \ + (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX)) + + +static void acc_f32(const float * x, const float * y, float * dst, const int ne, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, int offset, const sycl::nd_item<1> &item_ct1) { + const int i = SYCL_LOCAL_ID_CALC(item_ct1, 0); + if (i >= ne) { + return; + } + int src1_idx = i - offset; + int oz = src1_idx / nb2; + int oy = (src1_idx - (oz * nb2)) / nb1; + int ox = src1_idx % nb1; + if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { + dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; + } else { + dst[i] = x[i]; + } +} + +/* Unary OP funcs */ +template +static __dpct_inline__ T op_sgn(T x) { + return x > static_cast(0.f) ? static_cast(1.f) : ((x < static_cast(0.f) ? static_cast(-1.f) : static_cast(0.f))); +} + +template +static __dpct_inline__ T op_abs(T x) { + return sycl::fabs(x); +} + +template +static __dpct_inline__ T op_elu(T x) { + return (x > static_cast(0.f)) ? x : sycl::expm1(x); +} + +template +static __dpct_inline__ T op_gelu(T x) { + const T GELU_COEF_A = static_cast(0.044715f); + const T SQRT_2_OVER_PI = static_cast(0.79788456080286535587989211986876f); + return static_cast(0.5f) * x * + (static_cast(1.0f) + + sycl::tanh(SQRT_2_OVER_PI * x * (static_cast(1.0f) + GELU_COEF_A * x * x))); +} + +template +static __dpct_inline__ T op_silu(T x) { + return x / (static_cast(1.0f) + sycl::native::exp(-x)); +} + +template +static __dpct_inline__ T op_gelu_quick(T x) { + const T GELU_QUICK_COEF_LOCAL = static_cast(-1.702f); + return x * (static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x))); +} + +template +static __dpct_inline__ T op_gelu_erf(T x) { + const T SQRT_2_INV = static_cast(0.70710678118654752440084436210484f); + return static_cast(0.5f) * x * (static_cast(1.0f) + sycl::erf(x * SQRT_2_INV)); +} + +template +static __dpct_inline__ T op_tanh(T x) { + return sycl::tanh(x); +} + +template +static __dpct_inline__ T op_relu(T x) { + return sycl::fmax(x, static_cast(0)); +} + +template +static __dpct_inline__ T op_sigmoid(T x) { + return static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(-x)); +} + +template +static __dpct_inline__ T op_sqrt(T x) { + return sycl::sqrt(x); +} + +template +static __dpct_inline__ T op_sin(T x) { + return sycl::sin(x); +} + +template +static __dpct_inline__ T op_cos(T x) { + return sycl::cos(x); +} + +template +static __dpct_inline__ T op_hardsigmoid(T x) { + return sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); +} + +template +static __dpct_inline__ T op_hardswish(T x) { + return x * sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); +} + +template +static __dpct_inline__ T op_exp(T x) { + return sycl::exp(x); +} + +template +static __dpct_inline__ T op_log(T x) { + if (x <= static_cast(0)) { + return neg_infinity(); + } + return sycl::log(x); +} + +template +static __dpct_inline__ T op_neg(T x) { + return -x; +} + +template +static __dpct_inline__ T op_step(T x) { + return (x > static_cast(0.0f)) ? static_cast(1.0f) : static_cast(0.0f); +} + +template +static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) { + T neg_slope_T = static_cast(negative_slope); + return sycl::fmax(x, static_cast(0)) + + sycl::fmin(x, static_cast(0.0f)) * neg_slope_T; +} + +template +static __dpct_inline__ T op_sqr(T x) { + return x * x; +} + +template +static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) { + return x < static_cast(min_val) ? static_cast(min_val) : (x > static_cast(max_val) ? static_cast(max_val) : x); +} + +template +static __dpct_inline__ T op_floor(T x) { + return sycl::floor(x); +} + +template +static __dpct_inline__ T op_ceil(T x) { + return sycl::ceil(x); +} + +template +static __dpct_inline__ T op_round(T x) { + return sycl::round(x); +} + +template +static __dpct_inline__ T op_trunc(T x) { + return sycl::trunc(x); +} + +template +static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sgn(x[i]); + } +} + +template +static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_abs(x[i]); + } +} + +template +static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_elu(x[i]); + } +} + +template +static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu(x[i]); + } +} + +template +static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_silu(x[i]); + } +} + +template +static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu_quick(x[i]); + } +} + +template +static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_gelu_erf(x[i]); + } +} + +template +static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_tanh(x[i]); + } +} + +template +static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_relu(x[i]); + } +} + +template +static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sigmoid(x[i]); + } +} + +template +static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sqrt(x[i]); + } +} + +template +static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sin(x[i]); + } +} + +template +static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_cos(x[i]); + } +} + +template +static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_hardsigmoid(x[i]); + } +} + +template +static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_hardswish(x[i]); + } +} + +template +static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_exp(x[i]); + } +} + +template +static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_log(x[i]); + } +} + +template +static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_neg(x[i]); + } +} + +template +static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_step(x[i]); + } +} + +template +static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_leaky_relu(x[i], negative_slope); + } +} + +template +static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_sqr(x[i]); + } +} + +template +static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_clamp(x[i], min_val, max_val); + } +} + +template +static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_floor(x[i]); + } +} + +template +static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_ceil(x[i]); + } +} + +template +static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_round(x[i]); + } +} + +template +static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = op_trunc(x[i]); + } +} + +template +static void upscale(const T *x, T *dst, const int nb00, const int nb01, + const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int ne13, const float sf0, const float sf1, + const float sf2, const float sf3, const sycl::nd_item<1> &item_ct1) { + int index = item_ct1.get_local_id(0) + + item_ct1.get_group(0) * item_ct1.get_local_range(0); + if (index >= ne10 * ne11 * ne12 * ne13) { + return; + } + // operation + int i10 = index % ne10; + int i11 = (index / ne10) % ne11; + int i12 = (index / (ne10 * ne11)) % ne12; + int i13 = (index / (ne10 * ne11 * ne12)) % ne13; + + int i00 = static_cast(i10 / sf0); + int i01 = static_cast(i11 / sf1); + int i02 = static_cast(i12 / sf2); + int i03 = static_cast(i13 / sf3); + + dst[index] = *(const T *)((const char *)x + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00); +} + +template +static void clamp(const T * x, T * dst, const float min, const float max, const int k, + const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = x[i] < static_cast(min) ? static_cast(min) : (x[i] > static_cast(max) ? static_cast(max) : x[i]); + } +} + +template +static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_gelu(x[j0]) * g[j1]; + } +} + +template +static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_relu(x[j0]) * g[j1]; + } +} + +template +static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_silu(x[j0]) * g[j1]; + } +} + +template +static void gated_op_fused_geglu_erf(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_gelu_erf(x[j0]) * g[j1]; + } +} + +template +static void gated_op_fused_geglu_quick(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + const int64_t j0 = (i / n) * o0 + (i % n); + const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n); + dst[i] = op_gelu_quick(x[j0]) * g[j1]; + } +} + +namespace ggml_sycl_detail { +static void acc_f32_sycl(const float *x, const float *y, float *dst, + const int n_elements, const int ne10, const int ne11, + const int ne12, const int nb1, const int nb2, + const int offset, queue_ptr stream) { + int num_blocks = ceil_div(n_elements, SYCL_ACC_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * + sycl::range<1>(SYCL_ACC_BLOCK_SIZE), + sycl::range<1>(SYCL_ACC_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, + item_ct1); + }); +} + +template +static void arange_kernel(T * dst, const int k, T start, T step, + const sycl::nd_item<1> &item_ct1) { + SYCL_GLOBAL_ID_LOOP(k, item_ct1) { + dst[i] = start + static_cast(i) * step; + } +} + +template +static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01, + const int nb02, const int nb03, const int ne10, const int ne11, + const int ne12, const int ne13, const float sf0, const float sf1, + const float sf2, const float sf3, queue_ptr stream) { + int dst_size = ne10 * ne11 * ne12 * ne13; + int num_blocks = ceil_div(dst_size, SYCL_UPSCALE_BLOCK_SIZE); + sycl::range<1> gridDim(num_blocks * SYCL_UPSCALE_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(gridDim, sycl::range<1>(SYCL_UPSCALE_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + upscale(x, dst, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, item_ct1); + }); +} + +template +static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data(dst); + kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data(dst); + kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + } +} + +template +static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const int64_t nc = src1 ? src0->ne[0] : src0->ne[0] / 2;; + GGML_ASSERT(dst->ne[0] == nc); + GGML_ASSERT(ggml_is_contiguous_1(dst->src[0])); + GGML_ASSERT(ggml_is_contiguous(dst)); + const int32_t swapped = ((const int32_t *) dst->op_params)[1]; + void * src0_d = src0->data; + void * src1_d = src1 ? src1->data : src0->data; + const int64_t src0_o = src0->nb[1]; + const int64_t src1_o = src1 ? src1->nb[1] : src0->nb[1]; + void * dst_d = dst->data; + if (src1) { + GGML_ASSERT(ggml_is_contiguous_1(src1)); + GGML_ASSERT(src1->nb[0] == ggml_element_size(src1)); + GGML_ASSERT(src1->ne[0] == nc); + GGML_ASSERT(src0->type == src1->type); + } + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + sycl::half * src0_p = (sycl::half *) src0_d; + sycl::half * src1_p = (sycl::half *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + kernel_invoker(src0_p, + src1_p, + (sycl::half *) dst_d, + ggml_nelements(dst), + nc, + src0_o / sizeof(sycl::half), + src1_o / sizeof(sycl::half), + main_stream, + std::forward(args)...); + break; + } +#endif + case GGML_TYPE_F32: + { + float * src0_p = (float *) src0_d; + float * src1_p = (float *) src1_d; + + if (!src1) { + src0_p += swapped ? nc : 0; + src1_p += swapped ? 0 : nc; + } + + kernel_invoker(src0_p, + src1_p, + (float *) dst_d, + ggml_nelements(dst), + nc, + src0_o / sizeof(float), + src1_o / sizeof(float), + main_stream, + std::forward(args)...); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + } +} + +template +static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + + const float sf0 = (float) dst->ne[0] / dst->src[0]->ne[0]; + const float sf1 = (float) dst->ne[1] / dst->src[0]->ne[1]; + const float sf2 = (float) dst->ne[2] / dst->src[0]->ne[2]; + const float sf3 = (float) dst->ne[3] / dst->src[0]->ne[3]; + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data(dst); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], + (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, + main_stream, std::forward(args)...); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data(dst); + kernel_invoker(data_pts.src, data_pts.dst, (int)dst->src[0]->nb[0], (int)dst->src[0]->nb[1], (int)dst->src[0]->nb[2], + (int)dst->src[0]->nb[3], (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], sf0, sf1, sf2, sf3, + main_stream, std::forward(args)...); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + } +} + + +static inline void ggml_sycl_op_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + float start, stop, step; + memcpy(&start, dst->op_params, sizeof(float)); + memcpy(&stop, (float *) dst->op_params + 1, sizeof(float)); + memcpy(&step, (float *) dst->op_params + 2, sizeof(float)); + dpct::queue_ptr stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + float * dst_ptr = (float *)dst->data; + const int k = (int)ggml_nelements(dst); + const int num_blocks = ceil_div(k, SYCL_ARANGE_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE), + sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + arange_kernel(dst_ptr, k, start, step, item_ct1); + }); +} + +} // namespace ggml_sycl_detail + + + +static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE), + sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE), + sycl::range<1>(SYCL_TANH_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), + sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE), + sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE), + sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), + sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), + sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE), + sycl::range<1>(SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE), + sycl::range<1>(SYCL_NEG_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE), + sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE), + sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), + sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), + sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float slope) { + const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), + sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1); + }); + }, negative_slope); +} + +static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE), + sycl::range<1>(SYCL_SQR_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst, + [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03, + int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3, + queue_ptr stream) { + ggml_sycl_detail::upscale_sycl(src, dst_ptr, nb00, nb01, nb02, nb03, ne10, ne11, ne12, ne13, sf0, sf1, sf2, sf3, stream); + }); +} + +static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + float min_val; + float max_val; + memcpy(&min_val, dst->op_params, sizeof(float)); + memcpy(&max_val, (float *) dst->op_params + 1, sizeof(float)); + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream, float min_arg, float max_arg) { + const int num_blocks = ceil_div(k_elements, SYCL_CLAMP_BLOCK_SIZE); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE), + sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + clamp(src, dst_ptr, min_arg, max_arg, k_elements, item_ct1); + }); + }, min_val, max_val); +} + +static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_ceil_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, + [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { + const int num_blocks = ceil_div(k_elements, 256); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), + sycl::range<1>(256)), + [=](sycl::nd_item<1> item_ct1) { + unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast(dst->src[0]->data); + const float * src1_dd = static_cast(dst->src[1]->data); + float * dst_dd = static_cast(dst->data); + + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 + int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 + // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused + int offset = dst->op_params[3] / 4; // offset in bytes + + ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream); +} + +static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); + main_stream->parallel_for( + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu + main_stream->parallel_for( + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu + main_stream->parallel_for( + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); + main_stream->parallel_for( + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + +static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst, + [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { + const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); + main_stream->parallel_for( + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); + }); + }); +} + + +void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_sqrt(ctx, dst); +} + +void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_sin(ctx, dst); +} + +void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_cos(ctx, dst); +} + +void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + ggml_sycl_op_acc(ctx, dst); +} + +void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_gelu(ctx, dst); +} + +void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_silu(ctx, dst); +} + +void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_gelu_quick(ctx, dst); +} + +void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_gelu_erf(ctx, dst); +} + +void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_tanh(ctx, dst); +} + +void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_relu(ctx, dst); +} + +void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_sigmoid(ctx, dst); +} + +void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_hardsigmoid(ctx, dst); +} + +void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_hardswish(ctx, dst); +} + +void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_exp(ctx, dst); +} + +void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_log(ctx, dst); +} + +void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_neg(ctx, dst); +} + +void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_step(ctx, dst); +} + +void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_leaky_relu(ctx, dst); +} + +void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_sqr(ctx, dst); +} + +void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_upscale(ctx, dst); +} + + +void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_clamp(ctx, dst); +} + +void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_sgn(ctx, dst); +} + +void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_abs(ctx, dst); +} + +void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_elu(ctx, dst); +} + +void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_geglu(ctx, dst); +} + +void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_reglu(ctx, dst); +} + +void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_swiglu(ctx, dst); +} + +void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_geglu_erf(ctx, dst); +} + +void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_geglu_quick(ctx, dst); +} + +void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/0); + ggml_sycl_detail::ggml_sycl_op_arange(ctx, dst); +} + +void ggml_sycl_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_floor(ctx, dst); +} + +void ggml_sycl_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_ceil(ctx, dst); +} + +void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_round(ctx, dst); +} + +void ggml_sycl_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_trunc(ctx, dst); +} diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index fcf93295cb2..21bef273725 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -1,90 +1,90 @@ -#ifndef GGML_SYCL_ELEMENTWISE_HPP -#define GGML_SYCL_ELEMENTWISE_HPP - -#include "common.hpp" -#include "ggml.h" -#include // For std::numeric_limits - -template -T neg_infinity() { - return -std::numeric_limits::infinity(); -} - -template -struct typed_data { - const T_Src * src; - T_Dst * dst; -}; - -template -typed_data cast_data(ggml_tensor * dst) { - return { - /* .src = */ static_cast(dst->src[0]->data), - /* .dst = */ static_cast(dst->data) - }; -} - -const float GELU_QUICK_COEF = -1.702f; - - -void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst); -void ggml_sycl_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst); - -#endif // GGML_SYCL_ELEMENTWISE_HPP +#ifndef GGML_SYCL_ELEMENTWISE_HPP +#define GGML_SYCL_ELEMENTWISE_HPP + +#include "common.hpp" +#include "ggml.h" +#include // For std::numeric_limits + +template +T neg_infinity() { + return -std::numeric_limits::infinity(); +} + +template +struct typed_data { + const T_Src * src; + T_Dst * dst; +}; + +template +typed_data cast_data(ggml_tensor * dst) { + return { + /* .src = */ static_cast(dst->src[0]->data), + /* .dst = */ static_cast(dst->data) + }; +} + +const float GELU_QUICK_COEF = -1.702f; + + +void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_acc(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp index dcf6c7aeeb4..7a16158184e 100644 --- a/ggml/src/ggml-sycl/gemm.hpp +++ b/ggml/src/ggml-sycl/gemm.hpp @@ -1,90 +1,90 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_GEMM_HPP -#define GGML_SYCL_GEMM_HPP - -#include "ggml-sycl.h" - -#if GGML_SYCL_DNNL - -#include "dnnl.hpp" -#include "dnnl_sycl.hpp" - -class DnnlGemmWrapper { -public: - using dt = dnnl::memory::data_type; - using tag = dnnl::memory::format_tag; - - template - static constexpr dt to_dt() { - if constexpr (std::is_same_v) return dt::f32; - else if constexpr (std::is_same_v) return dt::f16; - else static_assert(0); - } - - static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k, - const void * a, dt at, dnnl_dim_t stra0, dnnl_dim_t stra1, dnnl_dim_t stra2, - const void * b, dt bt, dnnl_dim_t strb0, dnnl_dim_t strb1, dnnl_dim_t strb2, - void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) { - - auto stream = ctx.stream_dnnl(q); - auto eng = ctx.engine_dnnl(q); - - dnnl::memory::dims a_dims = {batches_a, m, k }; - dnnl::memory::dims a_strides = {stra2, stra1, stra0}; - const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides); - - dnnl::memory::dims b_dims = {batches_b, k, n }; - dnnl::memory::dims b_strides = {strb2, strb0, strb1}; - const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides); - - dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n}; - dnnl::memory::dims c_strides = {m*n, 1, m }; - const auto c_md = dnnl::memory::desc(c_dims, ct, c_strides); - dnnl::primitive_attr primitive_attr; - primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); - -#ifdef GGML_SYCL_F16 - primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16); -#endif - - auto a_mem = dnnl::memory(a_in_md, eng, const_cast(a)); - auto b_mem = dnnl::memory(b_in_md, eng, const_cast(b)); - auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md, primitive_attr); - auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c); - - auto scratchpad_md = matmul_pd.scratchpad_desc(); - auto scratchpad_mem = ctx.get_scratchpad_mem(scratchpad_md, eng, q); - - auto matmul_prim = dnnl::matmul(matmul_pd); - - std::unordered_map matmul_args; - matmul_args.insert({ DNNL_ARG_SRC, a_mem }); - matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem }); - - matmul_args.insert({ DNNL_ARG_DST, c_mem }); - matmul_args.insert({ DNNL_ARG_SCRATCHPAD, scratchpad_mem }); - - matmul_prim.execute(stream, matmul_args); - } - - static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k, - const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) { - - gemm(ctx, m, n, k, a, at, 1, k, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1); - } -}; - -#endif - -#endif // GGML_SYCL_GEMM_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_GEMM_HPP +#define GGML_SYCL_GEMM_HPP + +#include "ggml-sycl.h" + +#if GGML_SYCL_DNNL + +#include "dnnl.hpp" +#include "dnnl_sycl.hpp" + +class DnnlGemmWrapper { +public: + using dt = dnnl::memory::data_type; + using tag = dnnl::memory::format_tag; + + template + static constexpr dt to_dt() { + if constexpr (std::is_same_v) return dt::f32; + else if constexpr (std::is_same_v) return dt::f16; + else static_assert(0); + } + + static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k, + const void * a, dt at, dnnl_dim_t stra0, dnnl_dim_t stra1, dnnl_dim_t stra2, + const void * b, dt bt, dnnl_dim_t strb0, dnnl_dim_t strb1, dnnl_dim_t strb2, + void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches_a, dnnl_dim_t batches_b) { + + auto stream = ctx.stream_dnnl(q); + auto eng = ctx.engine_dnnl(q); + + dnnl::memory::dims a_dims = {batches_a, m, k }; + dnnl::memory::dims a_strides = {stra2, stra1, stra0}; + const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides); + + dnnl::memory::dims b_dims = {batches_b, k, n }; + dnnl::memory::dims b_strides = {strb2, strb0, strb1}; + const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides); + + dnnl::memory::dims c_dims = { std::max(batches_a, batches_b), m, n}; + dnnl::memory::dims c_strides = {m*n, 1, m }; + const auto c_md = dnnl::memory::desc(c_dims, ct, c_strides); + dnnl::primitive_attr primitive_attr; + primitive_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + +#ifdef GGML_SYCL_F16 + primitive_attr.set_fpmath_mode(dnnl::fpmath_mode::f16); +#endif + + auto a_mem = dnnl::memory(a_in_md, eng, const_cast(a)); + auto b_mem = dnnl::memory(b_in_md, eng, const_cast(b)); + auto matmul_pd = dnnl::matmul::primitive_desc(eng, a_in_md, b_in_md, c_md, primitive_attr); + auto c_mem = dnnl::memory(matmul_pd.dst_desc(), eng, c); + + auto scratchpad_md = matmul_pd.scratchpad_desc(); + auto scratchpad_mem = ctx.get_scratchpad_mem(scratchpad_md, eng, q); + + auto matmul_prim = dnnl::matmul(matmul_pd); + + std::unordered_map matmul_args; + matmul_args.insert({ DNNL_ARG_SRC, a_mem }); + matmul_args.insert({ DNNL_ARG_WEIGHTS, b_mem }); + + matmul_args.insert({ DNNL_ARG_DST, c_mem }); + matmul_args.insert({ DNNL_ARG_SCRATCHPAD, scratchpad_mem }); + + matmul_prim.execute(stream, matmul_args); + } + + static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k, + const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) { + + gemm(ctx, m, n, k, a, at, 1, k, k * m, b, bt, 1, k, n * k, c, ct, q, 1, 1); + } +}; + +#endif + +#endif // GGML_SYCL_GEMM_HPP diff --git a/ggml/src/ggml-sycl/getrows.cpp b/ggml/src/ggml-sycl/getrows.cpp index 03f8dd90748..b469d4898b9 100644 --- a/ggml/src/ggml-sycl/getrows.cpp +++ b/ggml/src/ggml-sycl/getrows.cpp @@ -1,215 +1,215 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#include "ggml-impl.h" -#include "common.hpp" -#include "dequantize.hpp" -#include "getrows.hpp" - - -template -static void k_get_rows( - const void * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12, - const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { - - const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + - item_ct1.get_local_id(2)) * - 2; - const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) / - ne12; - const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) % - ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; - - const int ib = i00/qk; // block index - const int iqs = (i00%qk)/qr; // quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(src0_row, ib, iqs, v); - - dst_row[iybs + iqs + 0] = v.x(); - dst_row[iybs + iqs + y_offset] = v.y(); -} - -template -static void k_get_rows_float( - const src0_t * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12, - const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { - - const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + - item_ct1.get_local_id(2); - const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) / - ne12; - const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + - item_ct1.get_local_id(0)) % - ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); - - dst_row[i00] = src0_row[i00]; -} - -template -static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, - ggml_tensor *dst, const void *src0_dd, - const int32_t *src1_dd, float *dst_dd, - queue_ptr stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); - const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE); - const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - GGML_ASSERT(ne00 % 2 == 0); - - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_get_rows( - src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); -} - -template -static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, - const ggml_tensor *src1, ggml_tensor *dst, - const src0_t *src0_dd, const int32_t *src1_dd, - float *dst_dd, queue_ptr stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); - const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE; - const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, - s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); - }); - } - - GGML_UNUSED(dst); - GGML_UNUSED(ctx); -} - -void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); - GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); - GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - - const int32_t * src1_i32 = (const int32_t *) dst->src[1]->data; - /* TODO: Refactor and remove duplicates */ - switch (dst->src[0]->type) { - case GGML_TYPE_F16: - get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::half *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - break; - case GGML_TYPE_F32: - get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - break; - case GGML_TYPE_Q4_0: - get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - break; - case GGML_TYPE_Q4_1: - get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - break; - case GGML_TYPE_Q5_0: - get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - break; - case GGML_TYPE_Q5_1: - get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - break; - case GGML_TYPE_Q8_0: - get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, - src1_i32, (float *)dst->data, ctx.stream()); - break; - default: - // TODO: k-quants - GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(dst->src[0]->type)); - GGML_ABORT("fatal error"); - } -} +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "ggml-impl.h" +#include "common.hpp" +#include "dequantize.hpp" +#include "getrows.hpp" + + +template +static void k_get_rows( + const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { + + const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2)) * + 2; + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) / + ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) % + ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; + + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(src0_row, ib, iqs, v); + + dst_row[iybs + iqs + 0] = v.x(); + dst_row[iybs + iqs + y_offset] = v.y(); +} + +template +static void k_get_rows_float( + const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12, + const sycl::nd_item<3> &item_ct1/*, size_t s13*/) { + + const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) + + item_ct1.get_local_id(2); + const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) + + item_ct1.get_local_id(1); + const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) / + ne12; + const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) + + item_ct1.get_local_id(0)) % + ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = src0_row[i00]; +} + +template +static void get_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, const ggml_tensor *src1, + ggml_tensor *dst, const void *src0_dd, + const int32_t *src1_dd, float *dst_dd, + queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE); + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + GGML_ASSERT(ne00 % 2 == 0); + + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_get_rows( + src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, + s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); + }); + + GGML_UNUSED(dst); + GGML_UNUSED(ctx); +} + +template +static void get_rows_sycl_float(ggml_backend_sycl_context & ctx, const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const src0_t *src0_dd, const int32_t *src1_dd, + float *dst_dd, queue_ptr stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE); + const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE; + const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, + s3, nb01, nb02, nb03, s10, s11, s12, item_ct1); + }); + } + + GGML_UNUSED(dst); + GGML_UNUSED(ctx); +} + +void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(dst->src[0]->nb[0] == ggml_type_size(dst->src[0]->type)); + GGML_ASSERT(dst->src[1]->nb[0] == ggml_type_size(dst->src[1]->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + + const int32_t * src1_i32 = (const int32_t *) dst->src[1]->data; + /* TODO: Refactor and remove duplicates */ + switch (dst->src[0]->type) { + case GGML_TYPE_F16: + get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const sycl::half *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_F32: + get_rows_sycl_float(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q4_0: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q4_1: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q5_0: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q5_1: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + case GGML_TYPE_Q8_0: + get_rows_sycl(ctx, dst->src[0], dst->src[1], dst, (const float *)dst->src[0]->data, + src1_i32, (float *)dst->data, ctx.stream()); + break; + default: + // TODO: k-quants + GGML_LOG_ERROR("%s: unsupported type: %s\n", __func__, ggml_type_name(dst->src[0]->type)); + GGML_ABORT("fatal error"); + } +} diff --git a/ggml/src/ggml-sycl/getrows.hpp b/ggml/src/ggml-sycl/getrows.hpp index 1c560cd9f89..73d6ab0b912 100644 --- a/ggml/src/ggml-sycl/getrows.hpp +++ b/ggml/src/ggml-sycl/getrows.hpp @@ -1,20 +1,20 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#ifndef GGML_SYCL_GETROWS_HPP -#define GGML_SYCL_GETROWS_HPP - -#include "common.hpp" - -void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst); - -#endif // GGML_SYCL_GETROWS_HPP +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_GETROWS_HPP +#define GGML_SYCL_GETROWS_HPP + +#include "common.hpp" + +void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor *dst); + +#endif // GGML_SYCL_GETROWS_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 33f9035075b..eb3f50bd0ad 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -1,4730 +1,4730 @@ -// -// MIT license -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: MIT -// - -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "ggml-sycl.h" -#include "ggml-impl.h" -#include "ggml-backend-impl.h" - -#include "ggml-sycl/backend.hpp" -#include "ggml-sycl/common.hpp" -#include "ggml-sycl/element_wise.hpp" -#include "ggml-sycl/presets.hpp" -#include "ggml-sycl/gemm.hpp" -#include "ggml-sycl/set_rows.hpp" -#include "ggml-sycl/set.hpp" -#include "ggml-sycl/sycl_hw.hpp" -#include "ggml-sycl/getrows.hpp" -#include "ggml-sycl/quantize.hpp" -#include "ggml.h" - -static bool g_sycl_loaded = false; -int g_ggml_sycl_debug = 0; -int g_ggml_sycl_disable_optimize = 0; -int g_ggml_sycl_disable_graph = 0; -int g_ggml_sycl_disable_dnn = 0; -int g_ggml_sycl_prioritize_dmmv = 0; - -static ggml_sycl_device_info ggml_sycl_init() { - ggml_sycl_device_info info = {}; - - info.device_count = dpct::dev_mgr::instance().device_count(); - if (info.device_count == 0) { - GGML_LOG_ERROR("%s: failed to initialize: %s\n", GGML_SYCL_NAME, __func__); - return info; - } - - GGML_ASSERT(info.device_count <= GGML_SYCL_MAX_DEVICES); - - int64_t total_vram = 0; -/* This is a bit misleading; reserved for later */ -// #if defined(SYCL_USE_XMX) -// GGML_LOG_INFO("%s: SYCL_USE_XMX: yes\n", __func__); -// #else -// GGML_LOG_INFO("%s: SYCL_USE_XMX: no\n", __func__); -// #endif - for (int i = 0; i < info.device_count; ++i) { - info.devices[i].vmm = 0; - dpct::device_info prop; - sycl::device device = dpct::dev_mgr::instance().get_device(i); - - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, device))); - - info.default_tensor_split[i] = total_vram; - total_vram += prop.get_global_mem_size(); - - info.devices[i].cc = - 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - info.devices[i].nsm = prop.get_max_compute_units(); - info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); - info.devices[i].smpbo = prop.get_local_mem_size(); - - info.max_work_group_sizes[i] = prop.get_max_work_group_size(); - } - - for (int id = 0; id < info.device_count; ++id) { - info.default_tensor_split[id] /= total_vram; - } - return info; -} - -const ggml_sycl_device_info & ggml_sycl_info() { - static ggml_sycl_device_info info = ggml_sycl_init(); - return info; -} - -static void print_device_detail(int id, sycl::device &device, std::string device_type) { - - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::get_device_info(prop, device))); - - std::string version; - version += std::to_string(prop.get_major_version()); - version += "."; - version += std::to_string(prop.get_minor_version()); - - device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); - std::string name = std::string(prop.get_name()); - name = std::regex_replace(name, std::regex("\\(R\\)"), ""); - name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); - - auto global_mem_size = prop.get_global_mem_size()/1000000; - GGML_LOG_INFO("|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), - name.c_str(), version.c_str(), prop.get_max_compute_units(), - prop.get_max_work_group_size(), prop.get_max_sub_group_size(), - global_mem_size, device.get_info().c_str()); -} - -static void print_device_opt_feature(int device_count) { - GGML_LOG_INFO("SYCL Optimization Feature:\n"); - GGML_LOG_INFO( - "|ID| Device Type|Reorder|\n"); - GGML_LOG_INFO( - "|--|-------------------|-------|\n"); - std::map DeviceNums; - for (int id = 0; id < device_count; ++id) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - std::string backend_type = get_device_backend_and_type(device); - int type_id = DeviceNums[backend_type]++; - std::stringstream device_type; - device_type << "[" << backend_type << ":" << std::to_string(type_id) - << "]"; - std::string device_type_s = device_type.str(); - device_type_s = std::regex_replace(device_type_s, std::regex("ext_oneapi_"), ""); - GGML_LOG_INFO("|%2d|%19s|%7s|\n", id, device_type_s.c_str(), - ggml_sycl_info().devices[id].opt_feature.reorder ? "Y": "N"); - } - -} -void ggml_backend_sycl_print_sycl_devices() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n"); - int device_count = dpct::dev_mgr::instance().device_count(); - std::map DeviceNums; - GGML_LOG_INFO("Found %d SYCL devices:\n", device_count); - - GGML_LOG_INFO( - "| | | | " - " |Max | |Max |Global | |\n"); - GGML_LOG_INFO( - "| | | | " - " |compute|Max work|sub |mem | |\n"); - GGML_LOG_INFO( - "|ID| Device Type| " - "Name|Version|units |group |group|size | Driver version|\n"); - GGML_LOG_INFO( - "|--|-------------------|---------------------------------------|------" - "-|-------|--------|-----|-------|---------------------|\n"); - - for (int id = 0; id < device_count; ++id) { - sycl::device device = dpct::dev_mgr::instance().get_device(id); - std::string backend_type = get_device_backend_and_type(device); - int type_id = DeviceNums[backend_type]++; - std::stringstream device_type; - device_type << "[" << backend_type << ":" << std::to_string(type_id) - << "]"; - print_device_detail(id, device, device_type.str()); - } - - print_device_opt_feature(device_count); -} - -static inline int get_sycl_env(const char *env_name, int default_val) { - char *user_device_string = getenv(env_name); - int user_number = default_val; - - unsigned n; - if (user_device_string != NULL && - sscanf(user_device_string, " %u", &n) == 1) { - user_number = (int)n; - } else { - user_number = default_val; - } - return user_number; -} - -static void ggml_check_sycl() try { - static bool initialized = false; - - if (!initialized) { - g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); - g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); - g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); - g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); - GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); - GGML_LOG_INFO("Running with Environment Variables:\n"); - GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug); - GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize); -#ifdef GGML_SYCL_GRAPH - GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph); -#else - GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n"); -#endif -#if GGML_SYCL_DNNL - GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn); -#else - GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: DNN disabled by compile flag\n"); -#endif - GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv); - GGML_LOG_INFO("Build with Macros:\n"); -#if defined(GGML_SYCL_FORCE_MMQ) - GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n"); -#else - GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: no\n"); -#endif -#if defined(GGML_SYCL_F16) - GGML_LOG_INFO(" GGML_SYCL_F16: yes\n"); -#else - GGML_LOG_INFO(" GGML_SYCL_F16: no\n"); -#endif - -/* NOT REMOVE, keep it for next optimize for XMX. -#if defined(SYCL_USE_XMX) - fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); -#else - fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); -#endif -*/ - - if (CHECK_TRY_ERROR(g_all_sycl_device_count = - dpct::dev_mgr::instance().device_count()) != 0) { - initialized = true; - g_sycl_loaded = false; - return; - } - GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); - - initialized = true; - g_sycl_loaded = true; - ggml_backend_sycl_print_sycl_devices(); - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -/* -device_index: device index from 0 to n (continue numbers). - It is used for device select/set in SYCL backend internal data structure. -*/ -inline void check_allow_gpu_index(const int device_index) { - if (device_index >= ggml_sycl_info().device_count) { - char error_buf[256]; - snprintf( - error_buf, - sizeof(error_buf), - "%s error: device_index:%d is out of range: [0-%d]", - __func__, - device_index, - ggml_sycl_info().device_count - 1); - GGML_LOG_ERROR("%s\n", error_buf); - assert(false); - } -} - -GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len) try { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_gpu_list\n"); - for(int i=0;i=max_len) break; - id_list[i] = i; - } - return; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -// sycl buffer - -struct ggml_backend_sycl_buffer_context { - int device; - void * dev_ptr = nullptr; - queue_ptr stream; - std::string name; - optimize_feature opt_feature; - std::vector tensor_extras; - - ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) : - device(device), dev_ptr(dev_ptr), stream(stream) { - check_allow_gpu_index(device); - name = (GGML_SYCL_NAME + std::to_string(device)); - opt_feature = ggml_sycl_info().devices[device].opt_feature; - } - - ~ggml_backend_sycl_buffer_context() { - if (dev_ptr != nullptr) { - ggml_sycl_set_device(device); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream))); - } - - //release extra used by tensors - for (ggml_tensor_extra_gpu * extra : tensor_extras) { - release_extra_gpu(extra); - } - - } -}; - -static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft); - -static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) { - return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name; -} - -static void -ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - ggml_sycl_set_device(ctx->device); - - delete ctx; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - return ctx->dev_ptr; -} - -static enum ggml_status -ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str()); - ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; - - if (tensor->view_src != NULL) { - assert(tensor->view_src->buffer->buft == buffer->buft); - return GGML_STATUS_SUCCESS; - } - if ((tensor->type == GGML_TYPE_Q4_0 || tensor->type == GGML_TYPE_Q4_K || tensor->type == GGML_TYPE_Q6_K) && - !g_ggml_sycl_disable_optimize) { - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; - tensor->extra = extra; - ctx->tensor_extras.push_back(extra); //used to release it when destroy ctx. - } - - if (ggml_is_quantized(tensor->type)) { - // initialize padding to 0 to avoid possible NaN values - size_t original_size = ggml_nbytes(tensor); - size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); - - if (padded_size > original_size && tensor->view_src == nullptr) { - SYCL_CHECK(CHECK_TRY_ERROR(ctx->stream->memset( - (char *)tensor->data + original_size, 0, - padded_size - original_size).wait())); - } - } - return GGML_STATUS_SUCCESS; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor, - const void *data, size_t offset, - size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); - GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - ggml_sycl_set_device(ctx->device); - auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); - SYCL_CHECK(CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); -#ifndef _WIN32 - // Note: Use host buffer to save the data from mmap(), then copy to device. It's workaround for mmap() issue on PVC GPU. - // This function will be called during load model from disk. Use memory buffer replace dynamic won't save more time and brings potential memory leak risk here. - char * host_buf = (char *) malloc(size); - memcpy(host_buf, data, size); - SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, host_buf, size).wait())); - free(host_buf); -#else - SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy((char *) tensor->data + offset, data, size).wait())); -#endif -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor *tensor, - void *data, size_t offset, - size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); - GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; - - ggml_sycl_set_device(ctx->device); - auto stream = dpct::dev_mgr::instance().get_device(ctx->device).default_queue(); - - SYCL_CHECK(CHECK_TRY_ERROR( - stream.memcpy(data, (const char *)tensor->data + offset, size) - .wait())); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, - const void *ptr_src, size_t size) { - char *host_buf = (char *)malloc(size); - q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); - q_dst.memcpy((char *)ptr_dst, host_buf, size).wait(); - free(host_buf); -} - -static bool -ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor *src, - ggml_tensor *dst) try { - bool is_cpy_supported = ggml_backend_buffer_is_sycl(src->buffer); - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": dst", dst).c_str()); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(" src", src).c_str()); - GGML_SYCL_DEBUG(" is_cpy_supported=%d\n", is_cpy_supported); - if (is_cpy_supported) { - ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; - ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)dst->buffer->context; - - ggml_sycl_set_device(src_ctx->device); - /* - DPCT1009:198: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::dev_mgr::instance().get_device(src_ctx->device).queues_wait_and_throw())); - ggml_sycl_set_device(dst_ctx->device); - /* - DPCT1009:199: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw())); - /* - DPCT1009:200: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - - queue_ptr stream_dst = dst_ctx->stream; - queue_ptr stream_src = src_ctx->stream; - size_t size = ggml_nbytes(src); - - //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs. - dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size); - -//todo, it's known issue:error in device2device cross GPUs. reused when the issue is fixed. DON"T remove -#if 0 - SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy( - (char *)dst->data, (const char *)src->data, size).wait())); - - /* - DPCT1009:201: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR( - dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw())); -#endif - return true; - } - return false; - GGML_UNUSED(buffer); -} catch (const sycl::exception & exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, - uint8_t value) try { - GGML_SYCL_DEBUG("[SYCL] call %s: size=%zu\n", __func__, buffer->size); - ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; - - ggml_sycl_set_device(ctx->device); - queue_ptr stream = ctx->stream; - SYCL_CHECK( - CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw())); - - SYCL_CHECK(CHECK_TRY_ERROR((*stream) - .memset(ctx->dev_ptr, value, buffer->size) - .wait())); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_backend_sycl_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, - size_t offset, size_t size) { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); - GGML_SYCL_DEBUG(" size=%zu offset=%zu value=%u\n", size, offset, value); - ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; - SYCL_CHECK(ggml_sycl_set_device(ctx->device)); - auto stream = &(dpct::dev_mgr::instance().get_device(ctx->device).default_queue()); - if (size == 0) { - return; // Nothing to do - } - if (tensor->data == nullptr) { - GGML_ABORT("Error: Tensor data pointer is null.\n"); - } - void * target_ptr = static_cast(tensor->data) + offset; - SYCL_CHECK(CHECK_TRY_ERROR((*stream).memset(target_ptr, value, size))); - SYCL_CHECK(CHECK_TRY_ERROR((*stream).wait())); -} - -static void ggml_backend_sycl_buffer_reset(ggml_backend_buffer_t buffer) { - GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__); - if (buffer == nullptr) { - return; - } - - ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *) buffer->context; - - if (ctx != nullptr) { - for (ggml_tensor_extra_gpu * extra : ctx->tensor_extras) { - release_extra_gpu(extra); - } - ctx->tensor_extras.clear(); // reset the tensor_extras vector - } -} - -static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { - /* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer, - /* .get_base = */ ggml_backend_sycl_buffer_get_base, - /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_sycl_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor, - /* .clear = */ ggml_backend_sycl_buffer_clear, - /* .reset = */ ggml_backend_sycl_buffer_reset, -}; - -// sycl buffer type -struct ggml_backend_sycl_buffer_type_context { - int device; - std::string name; - - // each buffer type has its own stream - queue_ptr stream = nullptr; -}; - -static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - - return ctx->name.c_str(); -} - -static ggml_backend_buffer_t -ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, - size_t size) try { - ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - ggml_sycl_set_device(buft_ctx->device); - const queue_ptr stream = buft_ctx->stream; - size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 - - void * dev_ptr; - SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( - size, *stream))); - if (!dev_ptr) { - GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); - return nullptr; - } - ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); - return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; - GGML_UNUSED(buft); -} - -static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - return dpct::get_current_device().get_max_mem_alloc_size(); - - GGML_UNUSED(buft); -} - -static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - size_t size = ggml_nbytes(tensor); - int64_t ne0 = tensor->ne[0]; - - if (ggml_is_quantized(tensor->type)) { - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - } - - return size; - - GGML_UNUSED(buft); -} - -static const ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { - /* .get_name = */ ggml_backend_sycl_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size, - /* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size, - /* .is_host = */ NULL, -}; - -ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { - static std::mutex mutex; - std::lock_guard lock(mutex); - - - auto dev_count = ggml_backend_sycl_get_device_count(); - - if (device>=dev_count or device<0) { - GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device, dev_count-1); - GGML_ASSERT(devicedevice; - if (device>=ggml_sycl_info().device_count or device<0) { - GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", - device, ggml_sycl_info().device_count-1); - GGML_ASSERT(devicestream(i, 0)}, - }; - } - ggml_backend_sycl_buffer_type_initialized = true; - } - return &ggml_backend_sycl_buffer_types[device]; -} - -// sycl split buffer - -static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { - int64_t min_compute_capability = INT_MAX; - int64_t max_compute_capability = INT_MIN; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) { - if (min_compute_capability > ggml_sycl_info().devices[i].cc) { - min_compute_capability = ggml_sycl_info().devices[i].cc; - } - if (max_compute_capability < ggml_sycl_info().devices[i].cc) { - max_compute_capability = ggml_sycl_info().devices[i].cc; - } - } - } - - switch(type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - return max_compute_capability >= VER_GEN9 ? 128 : 64; - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return 64; - case GGML_TYPE_F16: - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ4_NL: - return max_compute_capability >= VER_GEN9 ? 128 : 64; - case GGML_TYPE_IQ3_S: - return max_compute_capability >= VER_GEN9 ? 128 : 64; - case GGML_TYPE_Q6_K: - return 64; - default: - GGML_ABORT("fatal error"); - } -} - -static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { - const int64_t nrows = ggml_nrows(tensor); - const int64_t rounding = get_row_rounding(tensor->type, tensor_split); - - *row_low = id == 0 ? 0 : nrows*tensor_split[id]; - *row_low -= *row_low % rounding; - if (id == ggml_sycl_info().device_count - 1) { - *row_high = nrows; - } else { - *row_high = nrows*tensor_split[id + 1]; - *row_high -= *row_high % rounding; - } -} - -static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); -} - -struct ggml_backend_sycl_split_buffer_type_context { - std::array tensor_split; -}; - -struct ggml_backend_sycl_split_buffer_context { - ~ggml_backend_sycl_split_buffer_context() try { - for (ggml_tensor_extra_gpu * extra : tensor_extras) { - release_extra_gpu(extra, streams); - } - } - catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); - } - - std::vector tensor_extras; - std::vector streams; -}; - -static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; - delete ctx; -} - -static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { - // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced - return (void *)0x1000; - - GGML_UNUSED(buffer); -} - -static enum ggml_status -ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor, "\n").c_str()); - GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported - - ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; - ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; - - ctx->tensor_extras.push_back(extra); - ctx->streams.push_back(&(dpct::get_current_device().default_queue())); - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - // FIXME: do not crash if SYCL Buffer alloc fails - // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; - char * buf; - /* - DPCT1009:208: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( - size, *stream))); - if (!buf) { - char err_buf[1024]; - snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size); - throw std::runtime_error(err_buf); - } - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - /* - DPCT1009:209: SYCL uses exceptions to report errors and does not use - the error codes. The original code was commented out and a warning - string was inserted. You need to rewrite this code. - */ - SYCL_CHECK(CHECK_TRY_ERROR( - (*stream) - .memset(buf + original_size, 0, size - original_size) - .wait())); - } - - extra->data_device[i] = buf; - - for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) { - /* - DPCT1009:210: SYCL uses exceptions to report errors and does not use - the error codes. The original code was commented out and a warning - string was inserted. You need to rewrite this code. - */ - SYCL_CHECK( - CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event())); - } - } - tensor->extra = extra; - return GGML_STATUS_SUCCESS; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void -ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor, const void *data, - size_t offset, size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); - GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); - // split tensors must always be set in their entirety at once - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; - ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - const char * buf_host = (const char *)data + offset_split; - /* - DPCT1009:211: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; - SYCL_CHECK(CHECK_TRY_ERROR( - (*stream) - .memcpy(extra->data_device[i], buf_host, original_size) - .wait())); - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void -ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor *tensor, void *data, - size_t offset, size_t size) try { - GGML_SYCL_DEBUG("[SYCL] call %s", __func__); - GGML_SYCL_DEBUG("%s", debug_get_tensor_str(": tensor", tensor).c_str()); - GGML_SYCL_DEBUG(" size=%zu offset=%zu\n", size, offset); - // split tensors must always be set in their entirety at once - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; - ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf_host = (char *)data + offset_split; - /* - DPCT1009:212: SYCL uses exceptions to report errors and does not use the - error codes. The original code was commented out and a warning string - was inserted. You need to rewrite this code. - */ - ggml_sycl_set_device(i); - const queue_ptr stream = ctx->streams[i]; - SYCL_CHECK(CHECK_TRY_ERROR( - (*stream) - .memcpy(buf_host, extra->data_device[i], original_size) - .wait())); - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - GGML_UNUSED(buffer); - GGML_UNUSED(value); -} - -static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { - /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer, - /* .get_base = */ ggml_backend_sycl_split_buffer_get_base, - /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_sycl_split_buffer_clear, - /* .reset = */ NULL, -}; - -// sycl split buffer type - -static const char * ggml_backend_sycl_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return GGML_SYCL_NAME "_Split"; - - GGML_UNUSED(buft); -} - -static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { - return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name; -} - -static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point - // instead, we allocate them for each tensor separately in init_tensor - // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, - // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. - ggml_backend_sycl_split_buffer_context * ctx = new ggml_backend_sycl_split_buffer_context(); - - return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size); -} - -static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; - GGML_UNUSED(buft); -} - -static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context; - - size_t total_size = 0; - - const int64_t ne0 = tensor->ne[0]; - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - total_size += ggml_nbytes_split(tensor, nrows_split); - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - } - - return total_size; -} - -static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return false; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface = { - /* .get_name = */ ggml_backend_sycl_split_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_sycl_split_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, -}; - -ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { - static std::mutex mutex; - std::lock_guard lock(mutex); - - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n"); - ggml_check_sycl(); - // FIXME: this is not thread safe - static std::map, struct ggml_backend_buffer_type> buft_map; - - std::array tensor_split_arr = {}; - - bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_SYCL_MAX_DEVICES, [](float x) { return x == 0.0f; }); - if (all_zero) { - tensor_split_arr = ggml_sycl_info().default_tensor_split; - } else { - float split_sum = 0.0f; - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - tensor_split_arr[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - tensor_split_arr[i] /= split_sum; - } - } - - auto it = buft_map.find(tensor_split_arr); - if (it != buft_map.end()) { - return &it->second; - } - - struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_sycl_split_buffer_type_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), 0), - /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr}, - }; - - auto result = buft_map.emplace(tensor_split_arr, buft); - return &result.first->second; -} - -// host buffer type - -static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_SYCL_NAME "_Host"; - - GGML_UNUSED(buft); -} - -static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_sycl_host_free(buffer->context); -} - -static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * ptr = ggml_sycl_host_malloc(size); - - if (ptr == nullptr) { - // fallback to cpu buffer - return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - } - - // FIXME: this is a hack to avoid having to implement a new buffer type - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer; - - return buffer; -} - -ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { - GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n"); - static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_sycl_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_sycl_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), 0), - /* .context = */ nullptr, - }; - - return &ggml_backend_sycl_buffer_type_host; -} - -// buffer pool for sycl (legacy) -struct ggml_sycl_pool_leg : public ggml_sycl_pool { - static const int MAX_SYCL_BUFFERS = 256; - - int device; - queue_ptr qptr; - struct ggml_sycl_buffer { - void * ptr = nullptr; - size_t size = 0; - }; - - ggml_sycl_buffer buffer_pool[MAX_SYCL_BUFFERS] = {}; - size_t pool_size = 0; - - explicit ggml_sycl_pool_leg(queue_ptr qptr_, int device_) : device(device_), qptr(qptr_) {} - - ~ggml_sycl_pool_leg() { - for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { - ggml_sycl_buffer & b = buffer_pool[i]; - if (b.ptr != nullptr) { - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr))); - pool_size -= b.size; - } - } - GGML_ASSERT(pool_size == 0); - } - - void * alloc(size_t size, size_t * actual_size) override { -#ifdef DEBUG_sycl_MALLOC - int nnz = 0; - size_t max_size = 0; -#endif - size_t best_diff = 1ull << 36; - int ibest = -1; - for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { - ggml_sycl_buffer& b = buffer_pool[i]; - if (b.ptr != nullptr) { -#ifdef DEBUG_sycl_MALLOC - ++nnz; - if (b.size > max_size) max_size = b.size; -#endif - if (b.size >= size) { - size_t diff = b.size - size; - if (diff < best_diff) { - best_diff = diff; - ibest = i; - if (!best_diff) { - void * ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - } - } - } - } - if (ibest >= 0) { - ggml_sycl_buffer& b = buffer_pool[ibest]; - void * ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - void * ptr; - size_t look_ahead_size = (size_t) (1.05 * size); - - SYCL_CHECK( - CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device( - look_ahead_size, *qptr))); - if (!ptr) { - GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size); - return nullptr; - } - - *actual_size = look_ahead_size; - pool_size += look_ahead_size; - -#ifdef DEBUG_SYCL_MALLOC - GGML_LOG_DEBUG("%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz, - (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024)); -#endif - - // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr); - return ptr; - } - - void free(void * ptr, size_t size) override { - for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { - ggml_sycl_buffer& b = buffer_pool[i]; - if (b.ptr == nullptr) { - b.ptr = ptr; - b.size = size; - return; - } - } - GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n"); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr))); - pool_size -= size; - } -}; - -struct ggml_sycl_pool_host : public ggml_sycl_pool { - queue_ptr qptr; - int device; - - inline static int counter{ 0 }; - - struct ggml_sycl_buffer { - void * ptr = nullptr; - size_t size = 0; - }; - - // Set arbitrarly to 64 - static constexpr int MAX_POOL_SIZE{ 64 }; - std::vector buffer_pool = std::vector(MAX_POOL_SIZE); - size_t pool_size = 0; - - explicit ggml_sycl_pool_host(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {} - - ~ggml_sycl_pool_host() { - for (int i = 0; i < MAX_POOL_SIZE; ++i) { - ggml_sycl_buffer & b = buffer_pool[i]; - if (b.ptr != nullptr) { - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr))); - b.ptr = nullptr; - pool_size -= b.size; - b.size = 0; - } - } - counter = 0; - } - - void * alloc(size_t size, size_t * actual_size) override { - if (counter == MAX_POOL_SIZE) { - ggml_sycl_buffer b = buffer_pool[0]; - void * ptr = b.ptr; - *actual_size = b.size; - counter = 1; - return ptr; - } - ggml_sycl_buffer & b = buffer_pool[counter]; - - if (b.ptr == nullptr) { - void * ptr; - - SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *) sycl::malloc_host(size, *qptr))); - if (!ptr) { - GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size); - return nullptr; - } - pool_size += size; - *actual_size = size; - counter = counter + 1; - return ptr; - } else { - ++counter; - b.size = size; - return b.ptr; - } - } - - void free(void * ptr, size_t size) override { - // if the pool is not completed add the pointer to it in place of the first nullptr found. - // Otherwise do nothing, pointers will be freed once the pool is deallocated. - for (int i = 0; i < MAX_POOL_SIZE; ++i) { - ggml_sycl_buffer & b = buffer_pool[i]; - if (b.ptr == nullptr) { - b.ptr = ptr; - b.size = size; - return; - } - } - } -}; - -std::unique_ptr ggml_backend_sycl_context::new_pool_for_host(queue_ptr qptr, int device) { - // return pool for the host to speed up memory management - return std::unique_ptr(new ggml_sycl_pool_host(qptr, device)); -} - -std::unique_ptr ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) { - // TBD: NO VMM support - // if (ggml_sycl_info().devices[device].vmm) { - // return std::unique_ptr(new ggml_sycl_pool_vmm(device)); - // } - return std::unique_ptr(new ggml_sycl_pool_leg(qptr, device)); -} - -// TBD pool with virtual memory management -// struct ggml_sycl_pool_vmm : public ggml_sycl_pool - -/// kernels -typedef void (*ggml_sycl_op_mul_mat_t)( - ggml_backend_sycl_context & ctx, - const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, - float *dst_dd_i, const int64_t row_low, const int64_t row_high, - const int64_t src1_ncols, const int64_t src1_padded_row_size, - const queue_ptr &stream); - - - -static void mul_mat_p021_f16_f32( - const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, - const sycl::nd_item<3> &item_ct1) { - - const sycl::half *x = (const sycl::half *)vx; - - const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) + - item_ct1.get_local_id(0); - const int channel_x = channel / (nchannels_y / nchannels_x); - - const int nrows_y = ncols_x; - const int nrows_dst = nrows_x; - const int row_dst = row_x; - - float tmp = 0.0f; - - for (int col_x0 = 0; col_x0 < ncols_x; - col_x0 += item_ct1.get_local_range(2)) { - const int col_x = col_x0 + item_ct1.get_local_id(2); - - if (col_x >= ncols_x) { - break; - } - - // x is transposed and permuted - const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; - const float xi = - sycl::vec(x[ix]) - .convert()[0]; - - const int row_y = col_x; - - - // y is not transposed but permuted - const int iy = channel*nrows_y + row_y; - - tmp += xi * y[iy]; - } - - // dst is not transposed and not permuted - const int idst = channel*nrows_dst + row_dst; - - // sum up partial sums and write back result -#pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[idst] = tmp; - } -} - -static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous - const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, - const int row_stride_x, const int channel_stride_x,const int channel_stride_y, const int channel_x_divisor, - const sycl::nd_item<3> &item_ct1) { - - const sycl::half *x = (const sycl::half *)vx; - - const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) + - item_ct1.get_local_id(0); - const int channel_x = channel / channel_x_divisor; - - const int nrows_dst = nrows_x; - const int row_dst = row_x; - - const int idst = channel*nrows_dst + row_dst; - - float tmp = 0.0f; - - for (int col_x0 = 0; col_x0 < ncols_x; - col_x0 += item_ct1.get_local_range(2)) { - const int col_x = col_x0 + item_ct1.get_local_id(2); - - if (col_x >= ncols_x) { - break; - } - - const int row_y = col_x; - - const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; - const int iy = channel * channel_stride_y + row_y; - - const float xi = - sycl::vec(x[ix]) - .convert()[0]; - - tmp += xi * y[iy]; - } - - // sum up partial sums and write back result -#pragma unroll - for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { - tmp += - dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); - } - - if (item_ct1.get_local_id(2) == 0) { - dst[idst] = tmp; - } -} - -static void k_sum_rows_f32(const float * x, float * dst, const int ncols, - const sycl::nd_item<3> &item_ct1) { - const int row = item_ct1.get_group(1); - const int col = item_ct1.get_local_id(2); - - float sum = 0.0f; - for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) { - sum += x[row * ncols + i]; - } - - sum = warp_reduce_sum(sum, item_ct1); - - if (col == 0) { - dst[row] = sum; - } -} - - -template -static inline void ggml_sycl_swap(T & a, T & b) { - T tmp = a; - a = b; - b = tmp; -} - -template -__dpct_inline__ static void -k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad, - const int tasks_per_thread, const sycl::nd_item<3> &item_ct1, - uint8_t *dpct_local) { - // bitonic sort - int col_index = item_ct1.get_local_id(2); - int row = item_ct1.get_group(1); - - for (int i = 0; i < tasks_per_thread; i++) { - int col = col_index * tasks_per_thread + i; - if (col >= ncols_pad) { - return; - } - } - - const float * x_row = x + row * ncols; - auto dst_row = (int *)dpct_local; - - // initialize indices - for (int i=0;i 0; j /= 2) { - for (int i = 0; i < tasks_per_thread; i++) { - int col = col_index * tasks_per_thread + i; - int ixj = col ^ j; - if (ixj > col) { - if ((col & k) == 0) { - if (dst_row[col] >= ncols || - (dst_row[ixj] < ncols && - (order == GGML_SORT_ORDER_ASC - ? x_row[dst_row[col]] > x_row[dst_row[ixj]] - : x_row[dst_row[col]] < - x_row[dst_row[ixj]]))) { - ggml_sycl_swap(dst_row[col], dst_row[ixj]); - } - } else { - if (dst_row[ixj] >= ncols || - (dst_row[col] < ncols && - (order == GGML_SORT_ORDER_ASC - ? x_row[dst_row[col]] < x_row[dst_row[ixj]] - : x_row[dst_row[col]] > - x_row[dst_row[ixj]]))) { - ggml_sycl_swap(dst_row[col], dst_row[ixj]); - } - } - } - item_ct1.barrier(sycl::access::fence_space::local_space); - } - } - } - - // copy the result to dst without the padding - for (int i = 0; i < tasks_per_thread; i++) { - int col = col_index * tasks_per_thread + i; - if (col < ncols) { - dst[row * ncols + col] = dst_row[col]; - } - } -} - -static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past, - const sycl::nd_item<3> &item_ct1) { - const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (col >= ncols) { - return; - } - - const int i = row*ncols + col; - //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; - //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU - dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; -} - -static void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k, - const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - - if (i >= k) { - return; - } - - dst[i] = scale * x[i] + bias; -} - - -template -static void pool2d_nchw_kernel( - const int ih, const int iw, const int oh, const int ow, - const int kh, const int kw, const int sh, const int sw, - const int ph, const int pw, const int parallel_elements, - const Ti* src, To* dst, const enum ggml_op_pool op, - const sycl::nd_item<3> &item_ct1) { - int idx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (idx >= parallel_elements) { - return; - } - - const int I_HW = ih * iw; - const int O_HW = oh * ow; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / ow; - const int cur_ow = idx % O_HW % ow; - const Ti* i_ptr = src + nc * I_HW; - To* o_ptr = dst + nc * O_HW; - const int start_h = cur_oh * sh - ph; - const int bh = sycl::max(0, start_h); - const int eh = sycl::min(ih, start_h + kh); - const int start_w = cur_ow * sw - pw; - const int bw = sycl::max(0, start_w); - const int ew = sycl::min(iw, start_w + kw); - - To res = 0; - - switch (op) { - case GGML_OP_POOL_AVG: res = 0; break; - case GGML_OP_POOL_MAX: res = -FLT_MAX; break; - default: - res = (To) sycl::nan(uint32_t(0)); - break; - } - - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { -#if DPCT_COMPATIBILITY_TEMP >= 350 - /* - DPCT1098:106: The '*' expression is used instead of the __ldg - call. These two expressions do not provide the exact same - functionality. Check the generated code for potential precision - and/or performance issues. - */ - Ti cur = *(i_ptr + i * iw + j); -#else - Ti cur = i_ptr[i * iw + j]; -#endif - switch (op) { - case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break; - case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break; - default: - res = (To) sycl::nan(uint32_t(0)); - break; - } - } - } - o_ptr[cur_oh * ow + cur_ow] = res; -} - - -static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y, - float *dst, const int ncols_x, - const int nrows_x, - const int nchannels_x, - const int nchannels_y, - queue_ptr stream) { - - const sycl::range<3> block_nums(nchannels_y, nrows_x, 1); - const sycl::range<3> block_dims(1, 1, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x, - nchannels_y, item_ct1); - }); - } -} - -static void ggml_mul_mat_vec_nc_f16_f32_sycl( - const void *vx, const float *y, float *dst, const int ncols_x, - const int nrows_x, const int row_stride_x, const int nchannels_x, - const int nchannels_y, const int channel_stride_x, const int channel_stride_y, queue_ptr stream) { - - const sycl::range<3> block_nums(nchannels_y, nrows_x, 1); - const sycl::range<3> block_dims(1, 1, WARP_SIZE); - { - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, - row_stride_x, channel_stride_x, channel_stride_y, - nchannels_y / nchannels_x, item_ct1); - }); - } -} - - - -static void scale_f32_sycl(const float *x, float *dst, const float scale, const float bias, - const int k, queue_ptr stream) { - const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE; - stream->parallel_for( - sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * - sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - scale_f32(x, dst, scale, bias, k, item_ct1); - }); -} - - -static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, - const int nrows, queue_ptr stream) { - const sycl::range<3> block_dims(1, 1, WARP_SIZE); - const sycl::range<3> block_nums(1, nrows, 1); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - k_sum_rows_f32(x, dst, ncols, item_ct1); - }); -} - -static int next_power_of_2(int x) { - int n = 1; - while (n < x) { - n *= 2; - } - return n; -} - -static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, - const int nrows, ggml_sort_order order, - queue_ptr stream, int device) { - // bitonic sort requires ncols to be power of 2 - const int ncols_pad = next_power_of_2(ncols); - - int nth = 1; - int max_block_size = ggml_sycl_info().max_work_group_sizes[device]; - while (nth < ncols_pad && nth < max_block_size) - nth *= 2; - if (nth > max_block_size) - nth = max_block_size; - - const int tasks_per_thread = ncols_pad / nth; - - const sycl::range<3> block_dims(1, 1, nth); - const sycl::range<3> block_nums(1, nrows, 1); - const size_t shared_mem = ncols_pad * sizeof(int); - - if (order == GGML_SORT_ORDER_ASC) { - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor dpct_local_acc_ct1( - sycl::range<1>(shared_mem), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32( - x, dst, ncols, ncols_pad, tasks_per_thread, item_ct1, - dpct_local_acc_ct1 - .get_multi_ptr() - .get()); - }); - }); - } else if (order == GGML_SORT_ORDER_DESC) { - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor dpct_local_acc_ct1( - sycl::range<1>(shared_mem), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32( - x, dst, ncols, ncols_pad, tasks_per_thread, item_ct1, - dpct_local_acc_ct1 - .get_multi_ptr() - .get()); - }); - }); - } else { - GGML_ABORT("fatal error"); - } -} - -static void argmax_f32_i32_sycl(const float *x, int *dst, const int ncols, - const int nrows, queue_ptr stream) { - const sycl::range<3> block_dims(1, 1, SYCL_ARGMAX_BLOCK_SIZE); - const sycl::range<3> block_nums(1, nrows, 1); - const size_t shared_mem = 256 * sizeof(float); - - stream->submit([&](sycl::handler &cgh) { - sycl::local_accessor shared_data( - sycl::range<1>(shared_mem/sizeof(float)), cgh); - sycl::local_accessor shared_indices( - sycl::range<1>(shared_mem/sizeof(float)), cgh); - - cgh.parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - const int tid = item_ct1.get_local_id(2); - const int row = item_ct1.get_global_id(1); - - float max_val = -INFINITY; - int max_idx = -1; - - for (int col = tid; col < ncols; col += 256) { - float val = x[row * ncols + col]; - if (val > max_val) { - max_val = val; - max_idx = col; - } - } - - shared_data[tid] = max_val; - shared_indices[tid] = max_idx; - item_ct1.barrier(sycl::access::fence_space::local_space); - - for (int stride = 256/2; stride > 0; stride >>= 1) { - if (tid < stride) { - float val1 = shared_data[tid]; - float val2 = shared_data[tid + stride]; - if (val2 > val1) { - shared_data[tid] = val2; - shared_indices[tid] = shared_indices[tid + stride]; - } - } - item_ct1.barrier(sycl::access::fence_space::local_space); - } - - - if (tid == 0) { - dst[row] = shared_indices[0]; - } - }); - }); -} -static void diag_mask_inf_f32_sycl(const float *x, float *dst, - const int ncols_x, const int nrows_x, - const int rows_per_channel, const int n_past, - queue_ptr stream) { - const sycl::range<3> block_dims(1, SYCL_DIAG_MASK_INF_BLOCK_SIZE, 1); - const int block_num_x = (ncols_x + SYCL_DIAG_MASK_INF_BLOCK_SIZE - 1) / SYCL_DIAG_MASK_INF_BLOCK_SIZE; - const sycl::range<3> block_nums(1, block_num_x, nrows_x); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - diag_mask_inf_f32(x, dst, ncols_x, - rows_per_channel, n_past, - item_ct1); - }); -} - -static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, - const struct ggml_tensor *src, - int64_t i3, int64_t i2, - int64_t i1_low, int64_t i1_high, - queue_ptr stream) try { - - dpct::memcpy_direction kind; - char * src_ptr; - if (ggml_backend_buffer_is_host(src->buffer)) { - kind = dpct::host_to_device; - //GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__); - src_ptr = (char *) src->data; - // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr); - } else if (ggml_backend_buffer_is_sycl(src->buffer)) { - // If buffer is a SYCL buffer - //GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__); - kind = dpct::device_to_device; - src_ptr = (char *) src->data; - } else if (ggml_backend_buffer_is_sycl_split(src->buffer)) { - /* - If buffer is a SYCL split buffer - */ - //GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__); - GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]); - kind = dpct::device_to_device; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; - int id; - SYCL_CHECK(CHECK_TRY_ERROR( - id = get_current_device_id())); - // GGML_SYCL_DEBUG("current device index %d\n", id); - src_ptr = (char *) extra->data_device[id]; - } else { - // GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n"); - GGML_ABORT("fatal error"); - } - char * dst_ptr = (char *) dst; - - GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne); - GGML_TENSOR_LOCALS(int64_t, nb, src, nb); - const enum ggml_type type = src->type; - const int64_t ts = ggml_type_size(type); - const int64_t bs = ggml_blck_size(type); - int64_t i1_diff = i1_high - i1_low; - - const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; - if (nb0 == ts && nb1 == ts*ne0/bs) { - // GGML_SYCL_DEBUG("stream->memcpy: dst_ptr=%p, x=%p, size=%lu\n", dst_ptr, x, i1_diff * nb1); - // return CHECK_TRY_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1)); - return CHECK_TRY_ERROR(dpct::async_dpct_memcpy(dst_ptr, x, i1_diff * nb1, - kind, *stream)); - - } else if (nb0 == ts) { - return CHECK_TRY_ERROR( - dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1, - ts * ne0 / bs, i1_diff, kind, *stream)); - } else { - for (int64_t i1 = 0; i1 < i1_diff; i1++) { - const void * rx = (const void *) ((const char *) x + i1*nb1); - void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); - // pretend the row is a matrix with cols=1 - dpct::err0 r = CHECK_TRY_ERROR(dpct::async_dpct_memcpy( - rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream)); - /* - DPCT1001:85: The statement could not be removed. - */ - /* - DPCT1000:86: Error handling if-stmt was detected but could not be - rewritten. - */ - if (r != 0) return r; - } - return 0; - } -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -inline void ggml_sycl_op_mul_mat_sycl( - ggml_backend_sycl_context & ctx, - const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, - const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i, - float *dst_dd_i, const int64_t row_low, const int64_t row_high, - const int64_t src1_ncols, const int64_t src1_padded_row_size, - const queue_ptr &stream) try { - - GGML_ASSERT(src0_dd_i != nullptr); - GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_dd_i != nullptr); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne10 = src1->ne[0]; - GGML_ASSERT(ne00 == ne10); - - const int64_t row_diff = row_high - row_low; - - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_id())); - - const int64_t ne0 = dst->ne[0]; // used by MKL only - // the main device has a larger memory buffer to hold the results from all GPUs - // ldc == nrows of the matrix that cuBLAS writes into - int ldc = id == ctx.device ? ne0 : row_diff; // used by MKL only - -#ifdef GGML_SYCL_F16 - bool use_fp16 = true; // TODO(Yu) SYCL capability check -#else - bool use_fp16 = false; -#endif - if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) && - row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { - ggml_sycl_pool_alloc src0_as_f16(ctx.pool()); - if (src0->type != GGML_TYPE_F16) { - scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2, - " : converting src0 to fp16"); - const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type, dst); - GGML_ASSERT(to_fp16_sycl != nullptr); - size_t ne = row_diff*ne00; - src0_as_f16.alloc(ne); - to_fp16_sycl(src0_dd_i, src0_as_f16.get(), ne, stream); - } - const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16 - ? (const sycl::half *)src0_dd_i - : src0_as_f16.get(); - - ggml_sycl_pool_alloc src1_as_f16(ctx.pool()); - if (src1->type != GGML_TYPE_F16) { - scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2, - " : converting src1 to fp16"); - const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst); - GGML_ASSERT(to_fp16_sycl != nullptr); - size_t ne = src1_ncols*ne10; - src1_as_f16.alloc(ne); - to_fp16_sycl(src1_ddf_i, src1_as_f16.get(), ne, stream); - } - const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16 - ? (const sycl::half *)src1->data + src1_padded_row_size - : src1_as_f16.get(); - -#if GGML_SYCL_DNNL - if (!g_ggml_sycl_disable_dnn) { - DnnlGemmWrapper::row_gemm(ctx,row_diff, src1_ncols , ne10, src0_ptr, - DnnlGemmWrapper::to_dt(), src1_ptr, DnnlGemmWrapper::to_dt(), - dst_dd_i, DnnlGemmWrapper::to_dt(), stream); - } - else -#endif - { - ggml_sycl_pool_alloc dst_f16(ctx.pool(), row_diff * src1_ncols); - - const sycl::half alpha_f16 = 1.0f; - const sycl::half beta_f16 = 0.0f; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( - *stream, oneapi::math::transpose::trans, - oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10, - &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, - src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, - dst_f16.get(), dpct::library_data_t::real_half, ldc, - dpct::library_data_t::real_half))); - scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, - " : converting dst to fp32"); - const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst); - to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream); - } - } else { - ggml_sycl_pool_alloc src0_ddq_as_f32(ctx.pool()); - ggml_sycl_pool_alloc src1_ddq_as_f32(ctx.pool()); - if (src0->type != GGML_TYPE_F32) { - scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, - " : converting src0 to fp32"); - const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst); - GGML_ASSERT(to_fp32_sycl != nullptr); - src0_ddq_as_f32.alloc(row_diff*ne00); - to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream); - } - if (src1->type != GGML_TYPE_F32) { - scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, - " : converting src1 to fp32"); - const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type, dst); - GGML_ASSERT(to_fp32_sycl != nullptr); - src1_ddq_as_f32.alloc(src1_ncols*ne10); - to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream); - } - const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); - const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? (const float *) src1_ddf_i : src1_ddq_as_f32.get(); - -#if GGML_SYCL_DNNL - if (!g_ggml_sycl_disable_dnn) { - DnnlGemmWrapper::row_gemm(ctx, row_diff, src1_ncols, ne10, src0_ddf_i, - DnnlGemmWrapper::to_dt(), src1_ddf1_i, DnnlGemmWrapper::to_dt(), - dst_dd_i, DnnlGemmWrapper::to_dt(), stream); - } - else -#endif - { - const float alpha = 1.0f; - const float beta = 0.0f; - SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm( - get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff, - src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, - dpct::get_value(&beta, *stream), dst_dd_i, ldc))); - } - } - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddq_i); - GGML_UNUSED(src1_padded_row_size); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - - const int64_t IH = dst->src[0]->ne[1]; - const int64_t IW = dst->src[0]->ne[0]; - - const int64_t N = dst->ne[3]; - const int64_t OC = dst->ne[2]; - const int64_t OH = dst->ne[1]; - const int64_t OW = dst->ne[0]; - - const int parallel_elements = N * OC * OH * OW; - const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; - sycl::range<3> block_nums(1, 1, num_blocks); - main_stream->parallel_for( - sycl::nd_range<3>(block_nums * - sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, - parallel_elements, src0_dd, dst_dd, op, - item_ct1); - }); -} - -inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - const int64_t ne = ggml_nelements(dst->src[0]); - - sum_rows_f32_sycl(src0_dd, dst_dd, ne, 1, main_stream); -} - -inline void ggml_sycl_op_sum_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - const int64_t ncols = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); - - sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); -} - -inline void ggml_sycl_op_mean(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - const int64_t ncols = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); - - sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); - - main_stream->parallel_for( - sycl::range<1>(nrows), - [=](sycl::id<1> row) { - dst_dd[row] /= ncols; - } - ); -} - - -inline void ggml_sycl_op_argsort(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_I32); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - int32_t * dst_dd = static_cast(dst->data); - - - const int64_t ncols = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); - - enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - - argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, - main_stream, ctx.device); -} - -inline void ggml_sycl_op_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); - - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - int32_t * dst_dd = static_cast(dst->data); - - const int64_t ncols = dst->src[0]->ne[0]; - const int64_t nrows = ggml_nrows(dst->src[0]); - - argmax_f32_i32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream); -} - -inline void ggml_sycl_op_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - const int64_t ne00 = dst->src[0]->ne[0]; - const int64_t ne01 = dst->src[0]->ne[1]; - const int nrows0 = ggml_nrows(dst->src[0]); - - const int n_past = ((int32_t *) dst->op_params)[0]; - - diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); -} - -inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - float scale; - float bias; - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); - memcpy(&bias, (float *) dst->op_params + 1, sizeof(float)); - - scale_f32_sycl(src0_dd, dst_dd, scale, bias, ggml_nelements(dst->src[0]), main_stream); - /* - DPCT1010:87: SYCL uses exceptions to report errors and does not use the - error codes. The call was replaced with 0. You need to rewrite this code. - */ - SYCL_CHECK(0); -} - -static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) { - static bool peer_access_enabled = false; - - const bool enable_peer_access = n_tokens <= GGML_SYCL_PEER_MAX_BATCH_SIZE; - - if (peer_access_enabled == enable_peer_access) { - return; - } - -#ifdef NDEBUG - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); - } - - for (int i = 0; i < ggml_sycl_info().device_count; ++i) { - SYCL_CHECK(ggml_sycl_set_device(i)); - - for (int id_other = 0; id_other < ggml_sycl_info().device_count; ++id_other) { - if (i == id_other) { - continue; - } - if (i != main_device && id_other != main_device) { - continue; - } - - // int can_access_peer; - // SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other)); - // if (can_access_peer) { - // if (enable_peer_access) { - // SYCL_CHECK(syclDeviceEnablePeerAccess(id_other, 0)); - // } else { - // SYCL_CHECK(syclDeviceDisablePeerAccess(id_other)); - // } - // } - } - } -#endif // NDEBUG - - peer_access_enabled = enable_peer_access; -} - -template