From a8b0089a5b8dd61cc4dfe181d19e3865407d23cf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 28 Sep 2025 19:58:20 +0300 Subject: [PATCH] ggml : remove SVE paths --- ggml/src/ggml-cpu/ops.cpp | 263 +++++------------ ggml/src/ggml-cpu/simd-mappings.h | 163 +---------- ggml/src/ggml-cpu/vec.cpp | 132 +-------- ggml/src/ggml-cpu/vec.h | 459 ++++-------------------------- 4 files changed, 126 insertions(+), 891 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 14f7dcf4f41ad..48a820159a184 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -8646,41 +8646,7 @@ static void ggml_compute_forward_ssm_scan_f32( const int ii = i1 + h*nr; const float x_dt = x[ii] * dt_soft_plus; float sumf = 0.0f; -#if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - const int ggml_f32_epr = svcntw(); - const int ggml_f32_step = 1 * ggml_f32_epr; - - const int np = (nc & ~(ggml_f32_step - 1)); - - GGML_F32_VEC sum = GGML_F32_VEC_ZERO; - - GGML_F32_VEC adA = GGML_F32_VEC_SET1(dA); - GGML_F32_VEC axdt = GGML_F32_VEC_SET1(x_dt); - - for (int i = 0; i < np; i += ggml_f32_step) { - // TODO: maybe unroll more? - for (int j = 0; j < 1; j++) { - GGML_F32_VEC t0 = GGML_F32_VEC_LOAD(s0 + i + j*ggml_f32_epr + ii*nc); - GGML_F32_VEC t1 = GGML_F32_VEC_LOAD(B + i + j*ggml_f32_epr + g*nc); - GGML_F32_VEC t2 = GGML_F32_VEC_LOAD(C + i + j*ggml_f32_epr + g*nc); - - t0 = GGML_F32_VEC_MUL(t0, adA); - t1 = GGML_F32_VEC_MUL(t1, axdt); - - t0 = GGML_F32_VEC_ADD(t0, t1); - - sum = GGML_F32_VEC_FMA(sum, t0, t2); - - GGML_F32_VEC_STORE(s + i + j*ggml_f32_epr + ii*nc, t0); - } - } - - sumf = GGML_F32xt_REDUCE_ONE(sum); - #elif defined(__riscv_v_intrinsic) - // todo: RVV implementation - const int np = 0; - #else +#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic) const int np = (nc & ~(GGML_F32_STEP - 1)); GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; @@ -8711,7 +8677,6 @@ static void ggml_compute_forward_ssm_scan_f32( // reduce sum0..sum3 to sum0 GGML_F32_VEC_REDUCE(sumf, sum); - #endif #else const int np = 0; #endif @@ -8741,30 +8706,6 @@ static void ggml_compute_forward_ssm_scan_f32( for (int i1 = 0; i1 < nr; ++i1) { const int ii = i1 + h*nr; const float x_dt = x[ii] * dt_soft_plus; -#if defined(__ARM_FEATURE_SVE) - svfloat32_t vx_dt = GGML_F32_VEC_SET1(x_dt); - svfloat32_t vdt_soft_plus = GGML_F32_VEC_SET1(dt_soft_plus); - svfloat32_t r1_vector = GGML_F32_VEC_ZERO; - - // d_state - // TODO: what happens when (d_state % svcntw()) != 0? - for (int64_t k = 0; k < nc; k += svcntw()) { - svfloat32_t vA = GGML_F32_VEC_LOAD(&A[h*nc + k]); - svfloat32_t vB = GGML_F32_VEC_LOAD(&B[k + g*nc]); - svfloat32_t vC = GGML_F32_VEC_LOAD(&C[k + g*nc]); - svfloat32_t vs0 = GGML_F32_VEC_LOAD(&s0[ii*nc + k]); - - svfloat32_t t1 = GGML_F32_VEC_MUL(vdt_soft_plus, vA); - t1 = exp_ps_sve(svptrue_b32(), t1); - svfloat32_t t2 = GGML_F32_VEC_MUL(vx_dt, vB); - - vs0 = GGML_F32_VEC_FMA(t2, vs0, t1); - r1_vector = GGML_F32_VEC_ADD(GGML_F32_VEC_MUL(vs0, vC), r1_vector); - - GGML_F32_VEC_STORE(&s[ii*nc + k], vs0); - } - y[ii] = GGML_F32xt_REDUCE_ONE(r1_vector); -#else float sumf = 0.0f; // NOTE: can't really use GGML_SIMD here because d_state is usually 16 // and also because expf is used within the loop. 
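The branch retained above (guarded by GGML_SIMD and not __riscv_v_intrinsic) relies on ggml's fixed-width vector macros rather than the variable-length SVE types: it processes GGML_F32_STEP floats per step in GGML_F32_ARR registers of GGML_F32_EPR lanes, folds the accumulators with GGML_F32_VEC_REDUCE, and finishes any remainder in scalar code. A minimal sketch of that pattern, using only the GGML_F32_VEC macros referenced throughout this patch (the helper name sumproduct_f32 is hypothetical, for illustration only):

    // Hypothetical helper illustrating the generic GGML_SIMD accumulation
    // pattern that the non-SVE branches in this patch follow.
    static inline float sumproduct_f32(const int n, const float * x, const float * y) {
        float sumf = 0.0f;
    #if defined(GGML_SIMD)
        // main loop: GGML_F32_ARR registers per step, GGML_F32_EPR lanes each
        const int np = (n & ~(GGML_F32_STEP - 1));

        GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };

        GGML_F32_VEC ax[GGML_F32_ARR];
        GGML_F32_VEC ay[GGML_F32_ARR];

        for (int i = 0; i < np; i += GGML_F32_STEP) {
            for (int j = 0; j < GGML_F32_ARR; j++) {
                ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
                ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
                sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
            }
        }

        // fold the GGML_F32_ARR partial sums into one scalar
        GGML_F32_VEC_REDUCE(sumf, sum);

        // leftovers: fewer than GGML_F32_STEP elements, handled as scalars
        for (int i = np; i < n; ++i) {
            sumf += x[i]*y[i];
        }
    #else
        for (int i = 0; i < n; ++i) {
            sumf += x[i]*y[i];
        }
    #endif
        return sumf;
    }

The f16 helpers touched later in this patch (vec.h) follow the same shape, with GGML_F16_STEP/GGML_F16_ARR and a scalar tail that converts through GGML_CPU_FP16_TO_FP32 / GGML_CPU_FP32_TO_FP16.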
@@ -8779,7 +8720,6 @@ static void ggml_compute_forward_ssm_scan_f32( s[i] = state; } y[ii] = sumf; -#endif } } } @@ -9231,14 +9171,6 @@ static void ggml_compute_forward_rwkv_wkv6_f32( #define GGML_F32X_MUL GGML_F32x16_MUL #define GGML_F32X_FMA GGML_F32x16_FMA #define WKV_VECTOR_SIZE 16 - #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) - #define GGML_F32X GGML_F32xt - #define GGML_F32X_SET1 GGML_F32xt_SET1 - #define GGML_F32X_LOAD GGML_F32xt_LOAD - #define GGML_F32X_STORE GGML_F32xt_STORE - #define GGML_F32X_MUL GGML_F32xt_MUL - #define GGML_F32X_FMA GGML_F32xt_FMA - #define WKV_VECTOR_SIZE 8 #elif defined(__ARM_NEON) && defined(__aarch64__) #define GGML_F32X GGML_F32x4 #define GGML_F32X_SET1 GGML_F32x4_SET1 @@ -9251,11 +9183,7 @@ static void ggml_compute_forward_rwkv_wkv6_f32( #ifdef WKV_VECTOR_SIZE int wkv_vector_size; - #if defined(__ARM_FEATURE_SVE) - wkv_vector_size = svcntw(); - #else - wkv_vector_size = WKV_VECTOR_SIZE; - #endif + wkv_vector_size = WKV_VECTOR_SIZE; const int64_t vec_count = head_size / wkv_vector_size; for (int64_t t = 0; t < T; t++) { @@ -9447,14 +9375,6 @@ static void ggml_compute_forward_gla_f32( #define GGML_F32X_MUL GGML_F32x16_MUL #define GGML_F32X_FMA GGML_F32x16_FMA #define GLA_VECTOR_SIZE 16 - #elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) - #define GGML_F32X GGML_F32xt - #define GGML_F32X_SET1 GGML_F32xt_SET1 - #define GGML_F32X_LOAD GGML_F32xt_LOAD - #define GGML_F32X_STORE GGML_F32xt_STORE - #define GGML_F32X_MUL GGML_F32xt_MUL - #define GGML_F32X_FMA GGML_F32xt_FMA - #define GLA_VECTOR_SIZE 8 #elif defined(__ARM_NEON) && defined(__aarch64__) #define GGML_F32X GGML_F32x4 #define GGML_F32X_SET1 GGML_F32x4_SET1 @@ -9467,11 +9387,7 @@ static void ggml_compute_forward_gla_f32( #ifdef GLA_VECTOR_SIZE int gla_vector_size; - #if defined(__ARM_FEATURE_SVE) - gla_vector_size = svcntw(); - #else - gla_vector_size = GLA_VECTOR_SIZE; - #endif + gla_vector_size = GLA_VECTOR_SIZE; const int64_t vec_count = head_size / gla_vector_size; for (int64_t t = 0; t < T; t++) { @@ -9631,127 +9547,84 @@ static void ggml_compute_forward_rwkv_wkv7_f32( GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS int64_t h_stride_2d = head_size * head_size; - #if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) || defined(__riscv_v_intrinsic) - // scalar Route to scalar implementation //TODO: Write SVE code and RVV code - for (int64_t t = 0; t < T; t++) { - int64_t t_offset = t * t_stride; - int64_t state_offset = head_size * C * (t / (T / n_seqs)); - float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; - - for (int64_t h = h_start; h < h_end; h++) { - int64_t h_offset = h * h_stride; - int64_t t_h_offset = t_offset + h_offset; - int64_t h_2d_offset = h * h_stride_2d; - - for (int64_t i = 0; i < head_size; i++) { - int64_t t_h_i_offset = t_h_offset + i; - int64_t h_2d_i_offset = h_2d_offset + i * h_stride; - - float v_val = v[t_h_i_offset]; - - float sa = 0, result = 0; - for (int64_t j = 0; j < head_size; j++) { - sa += a[t_h_offset + j] * state_prev[h_2d_i_offset + j]; - } + #if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic) + for (int64_t t = 0; t < T; t++) { + int64_t t_offset = t * t_stride; + int64_t state_offset = head_size * C * (t / (T / n_seqs)); + float * state_cur = state + state_offset; + float * state_prev = t % (T / n_seqs) ? 
state_cur : (float*)dst->src[6]->data + state_offset; - for (int64_t j = 0; j < head_size; j++) { - int64_t t_h_j_offset = t_h_offset + j; - int64_t h_2d_i_j_offset = h_2d_i_offset + j; - - float r_val = r[t_h_j_offset]; - float w_val = w[t_h_j_offset]; - float k_val = k[t_h_j_offset]; - float b_val = b[t_h_j_offset]; - float kv_val = v_val * k_val; - float prev_state_val = state_prev[h_2d_i_j_offset]; - state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; - result += state_cur[h_2d_i_j_offset] * r_val; - } - dst_data[t_h_i_offset] = result; - } - } - } - #else - for (int64_t t = 0; t < T; t++) { - int64_t t_offset = t * t_stride; - int64_t state_offset = head_size * C * (t / (T / n_seqs)); - float * state_cur = state + state_offset; - float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[6]->data + state_offset; - - for (int64_t h = h_start; h < h_end; h++) { - int64_t h_offset = h * h_stride; - int64_t t_h_offset = t_offset + h_offset; - int64_t h_2d_offset = h * h_stride_2d; - - for (int64_t ii = 0; ii < head_size; ii++) { - int64_t t_h_i_offset = t_h_offset + ii; - int64_t h_2d_i_offset = h_2d_offset + ii * h_stride; - - GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]); - - float sa = 0; - { - GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; - for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) { - for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { - ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]); - ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]); - sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]); - } + for (int64_t h = h_start; h < h_end; h++) { + int64_t h_offset = h * h_stride; + int64_t t_h_offset = t_offset + h_offset; + int64_t h_2d_offset = h * h_stride_2d; + + for (int64_t ii = 0; ii < head_size; ii++) { + int64_t t_h_i_offset = t_h_offset + ii; + int64_t h_2d_i_offset = h_2d_offset + ii * h_stride; + + GGML_F32_VEC v_vec = GGML_F32_VEC_SET1(v[t_h_i_offset]); + + float sa = 0; + { + GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + GGML_F32_VEC ax[GGML_F32_ARR]; + GGML_F32_VEC ay[GGML_F32_ARR]; + for (int64_t j = 0; j < head_size; j += GGML_F32_STEP) { + for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { + ax[kk] = GGML_F32_VEC_LOAD(&a[t_h_offset + j + kk * GGML_F32_EPR]); + ay[kk] = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_offset + j + kk * GGML_F32_EPR]); + sum[kk] = GGML_F32_VEC_FMA(sum[kk], ax[kk], ay[kk]); } - GGML_F32_VEC_REDUCE(sa, sum); } + GGML_F32_VEC_REDUCE(sa, sum); + } - GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa); + GGML_F32_VEC sa_vec = GGML_F32_VEC_SET1(sa); - int64_t j = 0; - GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - for (; j < head_size; j += GGML_F32_STEP) { - for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { - int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR; - int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR; + int64_t j = 0; + GGML_F32_VEC result_vec[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; + for (; j < head_size; j += GGML_F32_STEP) { + for (int64_t kk = 0; kk < GGML_F32_ARR; kk++) { + int64_t t_h_j_offset = t_h_offset + j + kk * GGML_F32_EPR; + int64_t h_2d_i_j_offset = h_2d_i_offset + j + kk * GGML_F32_EPR; - GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]); - GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]); - GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]); - GGML_F32_VEC b_vec = 
GGML_F32_VEC_LOAD(&b[t_h_j_offset]); + GGML_F32_VEC r_vec = GGML_F32_VEC_LOAD(&r[t_h_j_offset]); + GGML_F32_VEC w_vec = GGML_F32_VEC_LOAD(&w[t_h_j_offset]); + GGML_F32_VEC k_vec = GGML_F32_VEC_LOAD(&k[t_h_j_offset]); + GGML_F32_VEC b_vec = GGML_F32_VEC_LOAD(&b[t_h_j_offset]); - k_vec = GGML_F32_VEC_MUL(v_vec, k_vec); + k_vec = GGML_F32_VEC_MUL(v_vec, k_vec); - GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]); - // kv + s * decay + sa * b - state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec); - state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec); - GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec); + GGML_F32_VEC state_vec = GGML_F32_VEC_LOAD(&state_prev[h_2d_i_j_offset]); + // kv + s * decay + sa * b + state_vec = GGML_F32_VEC_FMA(k_vec, state_vec, w_vec); + state_vec = GGML_F32_VEC_FMA(state_vec, sa_vec, b_vec); + GGML_F32_VEC_STORE(&state_cur[h_2d_i_j_offset], state_vec); - result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec); - } - } - GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec); - - // There shouldn't be left-overs though. - for (; j < head_size; j++) { - int64_t t_h_j_offset = t_h_offset + j; - int64_t h_2d_i_j_offset = h_2d_i_offset + j; - - float r_val = r[t_h_j_offset]; - float w_val = w[t_h_j_offset]; - float k_val = k[t_h_j_offset]; - float b_val = b[t_h_j_offset]; - float kv_val = v[t_h_i_offset] * k_val; - - float prev_state_val = state_prev[h_2d_i_j_offset]; - state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; - dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val; + result_vec[kk] = GGML_F32_VEC_FMA(result_vec[kk], state_vec, r_vec); } } + GGML_F32_VEC_REDUCE(dst_data[t_h_i_offset], result_vec); + + // There shouldn't be left-overs though. + for (; j < head_size; j++) { + int64_t t_h_j_offset = t_h_offset + j; + int64_t h_2d_i_j_offset = h_2d_i_offset + j; + + float r_val = r[t_h_j_offset]; + float w_val = w[t_h_j_offset]; + float k_val = k[t_h_j_offset]; + float b_val = b[t_h_j_offset]; + float kv_val = v[t_h_i_offset] * k_val; + + float prev_state_val = state_prev[h_2d_i_j_offset]; + state_cur[h_2d_i_j_offset] = prev_state_val * w_val + kv_val + sa * b_val; + dst_data[t_h_i_offset] += state_cur[h_2d_i_j_offset] * r_val; + } } } - #endif + } #else for (int64_t t = 0; t < T; t++) { int64_t t_offset = t * t_stride; diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 8daec6637b085..0bc44e17ba655 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -2,10 +2,6 @@ #include "ggml-cpu-impl.h" -#ifdef __ARM_FEATURE_SVE -#include -#endif // __ARM_FEATURE_SVE - #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) // if YCM cannot find , make a symbolic link to it, for example: // @@ -149,164 +145,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { // number of elements to fit in a single register // -#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA) - -#define GGML_SIMD - -// F32 SVE -#define GGML_F32_EPR 8 -#define DEFAULT_PG svptrue_b32() - -#define GGML_F32xt svfloat32_t -#define GGML_F32xt_ZERO svdup_n_f32(0.0f) -#define GGML_F32xt_SET1(x) svdup_n_f32(x) -#define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a) -#define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__) -#define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b) -#define GGML_F32xt_STORE(...) 
GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__) -#define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a) -#define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__) -#define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b) -#define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__) -#define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b) -#define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__) -#define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a) -#define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__) -#define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \ -{ \ - sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \ - sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \ - sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \ - sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \ - sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \ - sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \ - sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \ - (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \ -} -#define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__) - -#define GGML_F32_VEC GGML_F32xt -#define GGML_F32_VEC_ZERO GGML_F32xt_ZERO -#define GGML_F32_VEC_SET1 GGML_F32xt_SET1 -#define GGML_F32_VEC_LOAD GGML_F32xt_LOAD -#define GGML_F32_VEC_STORE GGML_F32xt_STORE -#define GGML_F32_VEC_FMA GGML_F32xt_FMA -#define GGML_F32_VEC_ADD GGML_F32xt_ADD -#define GGML_F32_VEC_MUL GGML_F32xt_MUL -#define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE - -// F16 SVE -#define DEFAULT_PG32 svptrue_b32() -#define DEFAULT_PG16 svptrue_b16() - -#define GGML_F32Cxt svfloat16_t -#define GGML_F32Cxt_ZERO svdup_n_f16(0.0f) -#define GGML_F32Cxt_SET1(x) svdup_n_f16(x) -#define GGML_F32Cxt_LOAD(p) svld1_f16(DEFAULT_PG16, (const __fp16 *)(p)) -#define GGML_F32Cxt_STORE(dst_ptr, src_vec) svst1_f16(DEFAULT_PG16, (__fp16 *)(dst_ptr), (src_vec)) - -#define GGML_F32Cxt_FMA_IMPL(pg, a, b, c) svmad_f16_x(pg, b, c, a) -#define GGML_F32Cxt_FMA(...) GGML_F32Cxt_FMA_IMPL(DEFAULT_PG16, __VA_ARGS__) -#define GGML_F32Cxt_ADD_IMPL(pg, a, b) svadd_f16_x(pg, a, b) -#define GGML_F32Cxt_ADD(...) GGML_F32Cxt_ADD_IMPL(DEFAULT_PG16, __VA_ARGS__) -#define GGML_F32Cxt_MUL_IMPL(pg, a, b) svmul_f16_x(pg, a, b) -#define GGML_F32Cxt_MUL(...) GGML_F32Cxt_MUL_IMPL(DEFAULT_PG16, __VA_ARGS__) -#define GGML_F32Cxt_REDUCE GGML_F16xt_REDUCE_MIXED - -#define GGML_F16x_VEC GGML_F32Cxt -#define GGML_F16x_VEC_ZERO GGML_F32Cxt_ZERO -#define GGML_F16x_VEC_SET1 GGML_F32Cxt_SET1 -#define GGML_F16x_VEC_LOAD(p, i) GGML_F32Cxt_LOAD(p) -#define GGML_F16x_VEC_STORE(p, r, i) GGML_F32Cxt_STORE((__fp16 *)(p), r) -#define GGML_F16x_VEC_FMA GGML_F32Cxt_FMA -#define GGML_F16x_VEC_ADD GGML_F32Cxt_ADD -#define GGML_F16x_VEC_MUL GGML_F32Cxt_MUL -#define GGML_F16x_VEC_REDUCE GGML_F32Cxt_REDUCE - -#define GGML_F16xt_REDUCE_ONE_IMPL(pg, a) svaddv_f16(pg, a) -#define GGML_F16xt_REDUCE_ONE(...) GGML_F16xt_REDUCE_ONE_IMPL(DEFAULT_PG16, __VA_ARGS__) - -#define GGML_F16xt_REDUCE_MIXED_IMPL(pg16, res, sum1, sum2, sum3, sum4) \ -{ \ - sum1 = svadd_f16_x(pg16, sum1, sum2); \ - sum3 = svadd_f16_x(pg16, sum3, sum4); \ - sum1 = svadd_f16_x(pg16, sum1, sum3); \ - __fp16 sum_f16 = svaddv_f16(pg16, sum1); \ - (res) = (ggml_float) sum_f16; \ -} -#define GGML_F16xt_REDUCE_MIXED(...) 
GGML_F16xt_REDUCE_MIXED_IMPL(DEFAULT_PG16, __VA_ARGS__) - -// F16 NEON - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define GGML_F16_STEP 32 - #define GGML_F16_EPR 8 - - #define GGML_F16x8 float16x8_t - #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) - #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) - #define GGML_F16x8_STORE vst1q_f16 - #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) - #define GGML_F16x8_ADD vaddq_f16 - #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ - do { \ - int offset = GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ - (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ - } while (0) - - #define GGML_F16_VEC GGML_F16x8 - #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO - #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 - #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i]) - #define GGML_F16_VEC_FMA GGML_F16x8_FMA - #define GGML_F16_VEC_ADD GGML_F16x8_ADD - #define GGML_F16_VEC_MUL GGML_F16x8_MUL - #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE -#else - // if FP16 vector arithmetic is not supported, we use FP32 instead - // and take advantage of the vcvt_ functions to convert to/from FP16 - - #define GGML_F16_STEP 16 - #define GGML_F16_EPR 4 - - #define GGML_F32Cx4 float32x4_t - #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) - #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) - #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) - #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) - #define GGML_F32Cx4_ADD vaddq_f32 - #define GGML_F32Cx4_MUL vmulq_f32 - #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE - - #define GGML_F16_VEC GGML_F32Cx4 - #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO - #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 - #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i]) - #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA - #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD - #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL - #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE -#endif - -#elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) +#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) #define GGML_SIMD diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index 437192d525a34..e3e0220517e06 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -18,73 +18,7 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G #if defined(GGML_SIMD) float sumf = 0.0f; - #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; - const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 - const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers - - const int np = (n & ~(ggml_f32_step - 1)); - svfloat32_t sum1 = svdup_n_f32(0.0f); - svfloat32_t sum2 = svdup_n_f32(0.0f); - svfloat32_t sum3 = svdup_n_f32(0.0f); - svfloat32_t sum4 = svdup_n_f32(0.0f); - 
svfloat32_t sum5 = svdup_n_f32(0.0f); - svfloat32_t sum6 = svdup_n_f32(0.0f); - svfloat32_t sum7 = svdup_n_f32(0.0f); - svfloat32_t sum8 = svdup_n_f32(0.0f); - svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8; - svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8; - for (int i = 0; i < np; i += ggml_f32_step) { - ax1 = GGML_F32_VEC_LOAD(x + i); - ay1 = GGML_F32_VEC_LOAD(y + i); - sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1); - - ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); - ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); - sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2); - - ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr); - ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); - sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3); - - ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); - ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); - sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4); - - ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); - ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); - sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5); - - ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); - ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); - sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6); - - ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); - ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); - sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7); - - ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); - ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); - sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8); - } - // leftovers - // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop - const int np2 = (n & ~(ggml_f32_epr - 1)); - for (int i = np; i < np2; i += ggml_f32_epr) { - ax1 = GGML_F32_VEC_LOAD(x + i); - ay1 = GGML_F32_VEC_LOAD(y + i); - sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1); - } - // maximum number of leftover elements will be less that ggml_f32_epr. 
Apply predicated svmad on available elements only - if (np2 < n) { - svbool_t pg = svwhilelt_b32(np2, n); - ax1 = svld1_f32(pg, x + np2); - ay1 = svld1_f32(pg, y + np2); - sum1 = svmad_f32_m(pg, ax1, ay1, sum1); - } - // reduce sum1,sum2 to sum1 - GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8); - #elif defined(__riscv_v_intrinsic) + #if defined(__riscv_v_intrinsic) int vl = __riscv_vsetvlmax_e32m8(); vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1); vfloat32m8_t vsum; @@ -215,69 +149,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G #if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; //get vector length - const int ggml_f16_epr = sve_register_length / 16; // running when 16 - const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers - - const int np= (n & ~(ggml_f16_step - 1)); - svfloat16_t sum1 = svdup_n_f16(0.0f); - svfloat16_t sum2 = svdup_n_f16(0.0f); - svfloat16_t sum3 = svdup_n_f16(0.0f); - svfloat16_t sum4 = svdup_n_f16(0.0f); - - svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; - svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; - for (int i = 0; i < np; i += ggml_f16_step) { - ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); - ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); - sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1); - - ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); - ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); - sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2); - - ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); - ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); - sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3); - - ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); - ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); - sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4); - - ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); - ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); - sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5); - - ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); - ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); - sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6); - - ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); - ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); - sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7); - - ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); - ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); - sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8); - } - - const int np2 = (n & ~(ggml_f16_epr - 1)); // round down to multiple of 8 - for (int k = np; k < np2; k += ggml_f16_epr) { - svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); - svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); - sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry); - } - - if (np2 < n) { - svbool_t pg = svwhilelt_b16(np2, n); - svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); - svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); - - sum1 = svmad_f16_x(pg, hx, hy, sum1); - } - GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4); - #elif defined(__riscv_v_intrinsic) + #if defined(__riscv_v_intrinsic) #if defined(__riscv_zvfh) int vl = __riscv_vsetvlmax_e32m2(); vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1); diff --git a/ggml/src/ggml-cpu/vec.h b/ggml/src/ggml-cpu/vec.h index ef334d089d1f7..cb447bbff1f59 100644 --- a/ggml/src/ggml-cpu/vec.h +++ b/ggml/src/ggml-cpu/vec.h @@ -118,150 +118,37 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG x[i] = (ggml_fp16_t *) ((char *) 
xv + i*xs); } -#if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - - const int sve_register_length = svcntb() * 8; - const int ggml_f16_epr = sve_register_length / 16; // running when 16 - const int ggml_f16_step = 8 * ggml_f16_epr; // choose 8 SVE registers - - const int np = (n & ~(ggml_f16_step - 1)); - - svfloat16_t sum_00 = svdup_n_f16(0.0f); - svfloat16_t sum_01 = svdup_n_f16(0.0f); - svfloat16_t sum_02 = svdup_n_f16(0.0f); - svfloat16_t sum_03 = svdup_n_f16(0.0f); - - svfloat16_t sum_10 = svdup_n_f16(0.0f); - svfloat16_t sum_11 = svdup_n_f16(0.0f); - svfloat16_t sum_12 = svdup_n_f16(0.0f); - svfloat16_t sum_13 = svdup_n_f16(0.0f); - - svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; - svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; - - for (int i = 0; i < np; i += ggml_f16_step) { - ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); // 8 elements - - ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0); // 8 elemnst - sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1); // sum_00 = sum_00+ax1*ay1 - ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0); // 8 elements - sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1); - - ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); // next 8 elements - - ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1); // next 8 ekements - sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2); - ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1); - sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2); - - ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); - - ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2); - sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3); - ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2); - sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3); - - ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); - - ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3); - sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4); - ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3); - sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4); - - ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); - - ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4); - - sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5); - ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4); - sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5); - - ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); - - ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5); - - sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6); - ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5); - sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6); - - ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); - - ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6); - - sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7); - ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6); - sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7); - - ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); - - ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7); - - sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8); - ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7); - sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8); - } - - const int np2 = (n & ~(ggml_f16_epr - 1)); - for (int k = np; k < np2; k += ggml_f16_epr) { - svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); - - svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0); - sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry); - rx = GGML_F16x_VEC_LOAD(x[1] + k, 0); - sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry); - } - - if (np2 < n) { - svbool_t pg = svwhilelt_b16(np2, n); - svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 
*)(x[0] + np2)); - svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2)); - svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); - - sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00); - sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10); - } - GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03); - GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13); - #elif defined(__riscv_v_intrinsic) - // todo: RVV impl - for (int i = 0; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); - } - } - #else - const int np = (n & ~(GGML_F16_STEP - 1)); +#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic) + const int np = (n & ~(GGML_F16_STEP - 1)); - GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; + GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); - sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); - } + sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); } } + } - // reduce sum0..sum3 to sum0 - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - GGML_F16_VEC_REDUCE(sumf[k], sum[k]); - } + // reduce sum0..sum3 to sum0 + for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { + GGML_F16_VEC_REDUCE(sumf[k], sum[k]); + } - // leftovers - for (int i = np; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); - } + // leftovers + for (int i = np; i < n; ++i) { + for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { + sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i])); } - #endif + } #else for (int i = 0; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { @@ -277,86 +164,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - - const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; - const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 - const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - - const int np = (n & ~(ggml_f32_step - 1)); - svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; - svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; - for (int i = 0; i < np; i += ggml_f32_step) { - - ax1 = GGML_F32_VEC_LOAD(x + i); - ay1 = GGML_F32_VEC_LOAD(y + i); - ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx); - - GGML_F32_VEC_STORE(y + i, ay1); - - ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); - ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); - ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx); - - GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); - - ax3 = GGML_F32_VEC_LOAD(x 
+ i + 2*ggml_f32_epr); - ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); - ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx); - - GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3); - - ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); - ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); - ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx); - - GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4); - - ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); - ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); - ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx); - - GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5); - - ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); - ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); - ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx); - - GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6); - - ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); - ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); - ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx); - - GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7); - - ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); - ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); - ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx); - - GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8); - } - // leftovers - // Since 8 unrolls are done in above loop, leftovers lie in range [0, ggml_f32_step] which is handled in below loop - const int np2 = (n & ~(ggml_f32_epr - 1)); - for (int i = np; i < np2; i += ggml_f32_epr) { - ax1 = GGML_F32_VEC_LOAD(x + i); - ay1 = GGML_F32_VEC_LOAD(y + i); - ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx); - - GGML_F32_VEC_STORE(y + i, ay1); - } - // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only - if (np2 < n) { - svbool_t pg =svwhilelt_b32(np2, n); - ax1 = svld1_f32(pg, x + np2); - ay1 = svld1_f32(pg, y + np2); - ay1 = svmad_f32_m(pg, ax1, vx, ay1); - - svst1_f32(pg, y + np2, ay1); - } - #elif defined(__riscv_v_intrinsic) + #if defined(__riscv_v_intrinsic) for (int i = 0, avl; i < n; i += avl) { avl = __riscv_vsetvl_e32m8(n - i); vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl); @@ -396,113 +204,28 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const } inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) { -#if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; - const int ggml_f16_epr = sve_register_length / 16; - const int ggml_f16_step = 8 * ggml_f16_epr; - - GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); - - const int np= (n & ~(ggml_f16_step - 1)); - - svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; - svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; - for (int i = 0; i < np; i += ggml_f16_step) { - ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); - ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); - ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx); - - GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0); - - ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); - ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); - ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx); - - GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1); - - ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); - ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); - ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx); - - GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2); - - ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); - ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); - ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx); - - 
GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3); - - ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); - ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); - ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx); - - GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4); - - ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); - ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); - ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx); - - GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5); +#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic) + const int np = (n & ~(GGML_F16_STEP - 1)); - ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); - ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); - ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx); + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6); + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; - ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); - ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); - ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx); + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); - GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7); - } - const int np2 = (n & ~(ggml_f16_epr - 1)); - for (int k = np; k < np2; k += ggml_f16_epr) { - svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); - svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); - ry = GGML_F16x_VEC_FMA(ry, rx, vx); - - GGML_F16x_VEC_STORE(y + k, ry, 0); - } - - if (np2 < n) { - svbool_t pg = svwhilelt_b16(np2, n); - svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); - svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); - hy = svmad_f16_x(pg, hx, vx, hy); - svst1_f16(pg, (__fp16 *)(y + np2), hy); - } - - #elif defined(__riscv_v_intrinsic) - // todo: RVV impl - // scalar - for (int i = 0; i < n; ++i) { - y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); - } - #else - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); - - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); - } + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); } + } - // leftovers - for (int i = np; i < n; ++i) { - y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); - } - #endif + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v); + } #else // scalar for (int i = 0; i < n; ++i) { @@ -523,14 +246,7 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int } #if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - // scalar Route to scalar implementation //TODO: Write SVE code - for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) { - for (int i = 0; i < n; ++i) { - y[i] += x[k][i]*v[k][0]; - } - } - #elif defined(__riscv_v_intrinsic) + #if defined(__riscv_v_intrinsic) for (int i = 0, avl; i < n; i += avl) { avl = __riscv_vsetvl_e32m8(n - i); vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl); @@ -586,12 +302,7 @@ 
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co #if defined(GGML_USE_ACCELERATE) vDSP_vsmsa(x, 1, &s, &b, y, 1, n); #elif defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - // scalar ; TODO: Write SVE code - for (int i = 0; i < n; ++i) { - y[i] = x[i]*s + b; - } - #elif defined(__riscv_v_intrinsic) + #if defined(__riscv_v_intrinsic) for (int i = 0, avl; i < n; i += avl) { avl = __riscv_vsetvl_e32m8(n - i); vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl); @@ -634,33 +345,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #if defined(GGML_USE_ACCELERATE) vDSP_vsmul(y, 1, &v, y, 1, n); #elif defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; - const int ggml_f32_epr = sve_register_length / 32;//8;//svcntw(); // SVE128:4, SVE256:8, SVE512:16 - const int ggml_f32_step = 2 * ggml_f32_epr; - - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - const int np = (n & ~(ggml_f32_step - 1)); - svfloat32_t ay1; - svfloat32_t ay2; - for (int i = 0; i < np; i += ggml_f32_step) { - ay1 = GGML_F32_VEC_LOAD(y + i); - ay1 = GGML_F32_VEC_MUL(ay1, vx); - GGML_F32_VEC_STORE(y + i, ay1); - - ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); - ay2 = GGML_F32_VEC_MUL(ay2, vx); - GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2); - } - // leftovers - // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only - if (np < n) { - svbool_t pg = svwhilelt_b32(np, n); - ay1 = svld1_f32(pg, y + np); - ay1 = svmul_f32_m(pg, ay1, vx); - svst1_f32(pg, y + np, ay1); - } - #elif defined(__riscv_v_intrinsic) + #if defined(__riscv_v_intrinsic) for (int i = 0, avl; i < n; i += avl) { avl = __riscv_vsetvl_e32m8(n - i); vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl); @@ -697,60 +382,26 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { } inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { -#if defined(GGML_SIMD) - #if defined(__ARM_FEATURE_SVE) - const int sve_register_length = svcntb() * 8; - const int ggml_f16_epr = sve_register_length / 16; - const int ggml_f16_step = 2 * ggml_f16_epr; - - GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v); - const int np = (n & ~(ggml_f16_step - 1)); - svfloat16_t ay1, ay2; - - for (int i = 0; i < np; i += ggml_f16_step) { - ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0); - ay1 = GGML_F16x_VEC_MUL(ay1, vx); - GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0); - - ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1); - ay2 = GGML_F16x_VEC_MUL(ay2, vx); - GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1); - } - // leftovers - // maximum number of leftover elements will be less that ggmlF_16x_epr. 
Apply predicated svmad on available elements only - if (np < n) { - svbool_t pg = svwhilelt_b16(np, n); - svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np)); - svfloat16_t out = svmul_f16_m(pg, hy, vx); - svst1_f16(pg, (__fp16 *)(y + np), out); - } - #elif defined(__riscv_v_intrinsic) - // todo: RVV impl - // scalar - for (int i = 0; i < n; ++i) { - y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); - } - #else - const int np = (n & ~(GGML_F16_STEP - 1)); +#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic) + const int np = (n & ~(GGML_F16_STEP - 1)); - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - GGML_F16_VEC ay[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_MUL(ay[j], vx); + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_MUL(ay[j], vx); - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); - } + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); } + } - // leftovers - for (int i = np; i < n; ++i) { - y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); - } - #endif + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v); + } #else // scalar for (int i = 0; i < n; ++i) {
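What disappears with the SVE branches above is not only the variable-length main loop but also its tail handling: instead of a scalar leftover loop, the removed code built a predicate with svwhilelt and finished the last partial vector in one masked pass. A minimal sketch of that idiom, assuming an SVE-enabled toolchain (the helper name scale_tail_f32_sve is hypothetical; the intrinsics are the ones used in the deleted ggml_vec_scale_f32 branch):

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>

    // Hypothetical illustration of the predicated-tail idiom used by the
    // removed SVE branches: np is where the unrolled main loop stopped, n is
    // the total count; the remaining n - np floats are scaled in one masked pass.
    static inline void scale_tail_f32_sve(const int np, const int n, float * y, const float v) {
        if (np < n) {
            svbool_t    pg = svwhilelt_b32(np, n);   // lanes np..n-1 active
            svfloat32_t vx = svdup_n_f32(v);
            svfloat32_t ay = svld1_f32(pg, y + np);  // masked load
            ay = svmul_f32_m(pg, ay, vx);            // masked multiply
            svst1_f32(pg, y + np, ay);               // masked store
        }
    }
    #endif

No scalar tails need to be added back by this patch: the retained generic branches already carry their own "leftovers" loops, as seen in the surviving code above.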