From de3c74f2bd9f984b4f71c56446d9f96eefb90dcc Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Tue, 18 Nov 2025 08:56:56 -0500 Subject: [PATCH 1/2] Adding native code related to (#138204) --- libs/simdvec/native/src/vec/c/aarch64/vec.c | 25 ++++++++++++++++++ libs/simdvec/native/src/vec/c/amd64/vec.c | 25 ++++++++++++++++++ libs/simdvec/native/src/vec/c/amd64/vec_2.cpp | 26 +++++++++++++++++++ libs/simdvec/native/src/vec/headers/vec.h | 2 ++ 4 files changed, 78 insertions(+) diff --git a/libs/simdvec/native/src/vec/c/aarch64/vec.c b/libs/simdvec/native/src/vec/c/aarch64/vec.c index f3eb7f51ee5d1..a231d41211a5b 100644 --- a/libs/simdvec/native/src/vec/c/aarch64/vec.c +++ b/libs/simdvec/native/src/vec/c/aarch64/vec.c @@ -95,6 +95,31 @@ EXPORT int32_t dot7u(int8_t* a, int8_t* b, size_t dims) { return res; } +EXPORT void dot7u_bulk(int8_t* a, int8_t* b, size_t dims, size_t count, float_t* results) { /* For each of count vectors of dims int8_t elements stored back to back at a, stores the int32 dot product with b, cast to float_t, in results[c]. b is never advanced, so all vectors are scored against the same b. NOTE(review): float_t is the math.h type whose width follows FLT_EVAL_METHOD, while cosf32 in vec.h uses plain float; confirm callers pass a 32-bit float buffer. */ + int32_t res = 0; + if (dims > DOT7U_STRIDE_BYTES_LEN) { /* strict >, so dims equal to exactly one stride takes the scalar branch */ + int limit = dims & ~(DOT7U_STRIDE_BYTES_LEN - 1); /* dims with the low bits cleared, i.e. rounded down to a stride multiple if the stride is a power of two (TODO confirm). NOTE(review): int narrows size_t dims; presumably dims stays far below INT_MAX */ + for (size_t c = 0; c < count; c++) { + int i = limit; + res = dot7u_inner(a, b, i); /* presumably covers elements [0, limit); helper is defined outside this hunk */ + for (; i < dims; i++) { /* scalar tail from limit up to dims */ + res += a[i] * b[i]; + } + results[c] = (float_t)res; + a += dims; /* step to the next vector; b is reused as is */ + } + } else { + for (size_t c = 0; c < count; c++) { + res = 0; /* reset per vector; the branch above overwrites res via the helper instead */ + for (size_t i = 0; i < dims; i++) { + res += a[i] * b[i]; + } + results[c] = (float_t)res; + a += dims; + } + } +} + static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, size_t dims) { int32x4_t acc1 = vdupq_n_s32(0); int32x4_t acc2 = vdupq_n_s32(0); diff --git a/libs/simdvec/native/src/vec/c/amd64/vec.c b/libs/simdvec/native/src/vec/c/amd64/vec.c index c6b9154b60660..7b7bcf45fef52 100644 --- a/libs/simdvec/native/src/vec/c/amd64/vec.c +++ b/libs/simdvec/native/src/vec/c/amd64/vec.c @@ -153,6 +153,31 @@ EXPORT int32_t dot7u(int8_t* a, int8_t* b, size_t dims) { return res; } +EXPORT void dot7u_bulk(int8_t* a, int8_t* b, size_t dims, size_t count, float_t* results) { /* For each of count vectors of dims int8_t elements stored back to back at a, stores the int32 dot product with b, cast to float_t, in results[c]. b is never advanced, so all vectors are scored against the same b. NOTE(review): float_t width follows FLT_EVAL_METHOD; confirm callers pass a 32-bit float buffer. */ + 
int32_t res = 0; + if (dims > STRIDE_BYTES_LEN) { /* strict >, so dims equal to exactly one stride takes the scalar branch */ + int limit = dims & ~(STRIDE_BYTES_LEN - 1); /* dims with the low bits cleared, i.e. rounded down to a stride multiple if the stride is a power of two (TODO confirm). NOTE(review): int narrows size_t dims; presumably dims stays far below INT_MAX */ + for (size_t c = 0; c < count; c++) { + int i = limit; + res = dot7u_inner(a, b, i); /* presumably covers elements [0, limit); helper is defined outside this hunk */ + for (; i < dims; i++) { /* scalar tail from limit up to dims */ + res += a[i] * b[i]; + } + results[c] = (float_t)res; + a += dims; /* step to the next vector; b is reused as is */ + } + } else { + for (size_t c = 0; c < count; c++) { + res = 0; /* reset per vector; the branch above overwrites res via the helper instead */ + for (size_t i = 0; i < dims; i++) { + res += a[i] * b[i]; + } + results[c] = (float_t)res; + a += dims; + } + } +} + static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, size_t dims) { // Init accumulator(s) with 0 __m256i acc1 = _mm256_setzero_si256(); diff --git a/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp b/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp index dd062f8210c3c..3de04001d4c45 100644 --- a/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp +++ b/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp @@ -114,6 +114,32 @@ EXPORT int32_t dot7u_2(int8_t* a, int8_t* b, size_t dims) { return res; } +extern "C" +EXPORT void dot7u_bulk_2(int8_t* a, int8_t* b, size_t dims, size_t count, float_t* results) { /* Same loop structure as dot7u_bulk in amd64/vec.c, but the strided part is delegated to dot7u_inner_avx512. Writes one float_t per vector into results[c]; a advances by dims per vector and b is never advanced. NOTE(review): float_t width follows FLT_EVAL_METHOD; confirm callers pass a 32-bit float buffer. */ + int32_t res = 0; + if (dims > STRIDE_BYTES_LEN) { /* strict >, so dims equal to exactly one stride takes the scalar branch */ + int limit = dims & ~(STRIDE_BYTES_LEN - 1); /* dims with the low bits cleared, i.e. rounded down to a stride multiple if the stride is a power of two (TODO confirm). NOTE(review): int narrows size_t dims; presumably dims stays far below INT_MAX */ + for (size_t c = 0; c < count; c++) { + int i = limit; + res = dot7u_inner_avx512(a, b, i); /* presumably covers elements [0, limit); helper is defined outside this hunk */ + for (; i < dims; i++) { /* scalar tail from limit up to dims */ + res += a[i] * b[i]; + } + results[c] = (float_t)res; + a += dims; /* step to the next vector; b is reused as is */ + } + } else { + for (size_t c = 0; c < count; c++) { + res = 0; /* reset per vector; the branch above overwrites res via the helper instead */ + for (size_t i = 0; i < dims; i++) { + res += a[i] * b[i]; + } + results[c] = (float_t)res; + a += dims; + } + } +} + template inline __m512i sqr8(__m512i acc, const int8_t* p1, const int8_t* p2) { constexpr int lanes = offsetRegs * STRIDE_BYTES_LEN; diff --git a/libs/simdvec/native/src/vec/headers/vec.h b/libs/simdvec/native/src/vec/headers/vec.h index 733aea3165659..eb927bef61566 100644 --- a/libs/simdvec/native/src/vec/headers/vec.h +++ b/libs/simdvec/native/src/vec/headers/vec.h @@ -19,6 +19,8 @@ EXPORT int vec_caps(); EXPORT int32_t dot7u(int8_t* a, int8_t* b, size_t dims); +EXPORT 
void dot7u_bulk(int8_t* a, int8_t* b, size_t dims, size_t count, float_t* results); /* Scores count vectors of dims elements, laid out back to back at a, against the single vector b; one value per vector is written to results. NOTE(review): results is float_t here while cosf32 below uses plain float; confirm the intended element type. */ + EXPORT int32_t sqr7u(int8_t *a, int8_t *b, size_t length); EXPORT float cosf32(const float *a, const float *b, size_t elementCount); From 66fc25e726ae56d7cfaed551dda3343d5e69eaec Mon Sep 17 00:00:00 2001 From: Lorenzo Dematte Date: Tue, 18 Nov 2025 17:02:56 +0100 Subject: [PATCH 2/2] Bump simdvec native lib build/publish VERSION --- libs/simdvec/native/publish_vec_binaries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/simdvec/native/publish_vec_binaries.sh b/libs/simdvec/native/publish_vec_binaries.sh index 0258ed5760b6b..ae0500b0b6ffa 100755 --- a/libs/simdvec/native/publish_vec_binaries.sh +++ b/libs/simdvec/native/publish_vec_binaries.sh @@ -20,7 +20,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then exit 1; fi -VERSION="1.0.13" +VERSION="1.0.14" ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}" TEMP=$(mktemp -d)