From 18ffe4c2939bc5a8b9f9ce58970ac5ecfa6eb53a Mon Sep 17 00:00:00 2001
From: "Ahmed Abdelhakim (Si-Vision)"
Date: Tue, 29 Jun 2021 12:18:41 +0200
Subject: [PATCH] use mul_hi for leaky/prelu sa8

---
 .../transform/impl/mli_krn_leaky_relu_ref.h   | 106 ++++++++++++------
 .../transform/impl/mli_krn_leaky_relu_vdsp.h  |  50 ++++++---
 .../transform/impl/mli_krn_prelu_dsp.h        |   6 +-
 .../transform/impl/mli_krn_prelu_ref.h        |  17 ++-
 .../transform/impl/mli_krn_prelu_vdsp.h       |  66 ++++++-----
 lib/src/kernels/transform/mli_krn_prelu.h     |   6 +-
 .../kernels/transform/mli_krn_prelu_decl.h    |   6 +-
 .../tests_mli_krn_leaky_relu.cc               |   6 +-
 .../mli_krn_prelu/tests_mli_krn_prelu.cc      |  14 +--
 9 files changed, 162 insertions(+), 115 deletions(-)

diff --git a/lib/src/kernels/transform/impl/mli_krn_leaky_relu_ref.h b/lib/src/kernels/transform/impl/mli_krn_leaky_relu_ref.h
index 17ab6b18..ea90fa7f 100644
--- a/lib/src/kernels/transform/impl/mli_krn_leaky_relu_ref.h
+++ b/lib/src/kernels/transform/impl/mli_krn_leaky_relu_ref.h
@@ -62,17 +62,17 @@ static MLI_FORCE_INLINE void compute_leaky_relu(
     int8_t input = vec_in[0];
     int32_t output;
     if (input >= in_zp) {
-        /* out_sa8 = (idendity_scale * in_sa8) * 2^(-(identity_shift)) + identity_offset */
-        output = mli_math_add_fx(
-                 mli_math_asr_rnd_fx(
-                 mli_math_mul_fx(identity_params->scale, input),
-                 identity_params->shift), identity_params->offset);
+        /* out_sa8 = (identity_scale * (in_sa8 - in_zp)) * 2^(-(identity_shift)) + identity_offset */
+        int16_t input_sub = mli_math_sub_fx((int16_t)input, in_zp);
+        output = mli_math_asr_rnd_fx(
+                 mli_math_mul_fx(identity_params->scale, input_sub), identity_params->shift);
+        output = mli_math_add_fx(output, (int32_t)identity_params->offset);
     } else {
-        /* out_sa8 = (alpha_scale * in_sa8) * 2^(-(alpha_shift)) + alpha_offset */
-        output = mli_math_add_fx(
-                 mli_math_asr_rnd_fx(
-                 mli_math_mul_fx(alpha_params->scale, input),
-                 alpha_params->shift), alpha_params->offset);
+        /* out_sa8 = (alpha_scale * (in_sa8 - in_zp)) * 2^(-(alpha_shift)) + alpha_offset */
+        int16_t input_sub = mli_math_sub_fx((int16_t)input, in_zp);
+        output = mli_math_asr_rnd_fx(
+                 mli_math_mul_fx(alpha_params->scale, input_sub), alpha_params->shift);
+        output = mli_math_add_fx(output, (int32_t)alpha_params->offset);
     }
 
     vec_out[0] = mli_math_cast_fx(output, 0);
@@ -201,13 +201,64 @@ static MLI_FORCE_INLINE mli_status leaky_relu_fx_run(const mli_tensor *in,
     return MLI_STATUS_OK;
 }
 
-static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_requant_params(const mli_tensor *in,
+static MLI_FORCE_INLINE void leaky_relu_define_identity_params(const mli_tensor *in, const mli_tensor *out,
+        s8asym_quant_params *params) {
+
+    /* ****************************************************************************************************************
+     * Mathematical Derivation of out_sa8 Requantization Params to use with in_sa8
+     * ----------------------------------------------------------------------------------------------------------------
+     * out_sa8 = (in_scale_val/out_scale_val) *(in_sa8 - in_zp) + out_zp
+     *         = scale_val * (in_sa8 - in_zp) + out_zp
+     * where:
+     *
+     * scale_val = in_scale_val / out_scale_val;
+     *           = in_scale * 2^(-in_scale_frac_bits) / (out_scale * 2^(-out_scale_frac_bits));
+     *           = (in_scale_val * 2^kPreDivShift / out_scale_val)
+     *             * 2^(-(kPreDivShift + in_scale_frac_bits - out_scale_frac_bits));
+     *           = (in_scale_val * 2^kPreDivShift / out_scale_val) * 2^(-norm_shift)
+     *             * 2^(-(kPreDivShift + in_scale_frac_bits - out_scale_frac_bits - norm_shift));
+     *           = (in_scale_val * 2^kPreDivShift / out_scale_val) * 2^(-norm_shift)
+     *             * 2^(-scale_shift)
+     *           = scale * 2 ^(-(scale_shift))
+     *
+     * where scale = (in_scale_val * 2^kPreDivShift / out_scale_val) * 2^(-norm_shift)
+     *       scale_shift = kPreDivShift + in_scale_frac_bits - out_scale_frac_bits - norm_shift
+     *       norm_shift is the shift value due to normalizing the result of
+     *                  (in_scale_val * 2^kPreDivShift / out_scale_val) and casting it from int32_t to int16_t
+     *       kPreDivShift is derived from norm_32(in_scale) - norm_16(out_scale)
+     *
+     * offset = out_zp
+     *
+     * ***************************************************************************************************************/
+
+    int16_t scale_in = mli_hlp_tensor_scale(in, 0);
+    int16_t scale_out = mli_hlp_tensor_scale(out, 0);
+    int16_t out_zp = out->el_params.sa.zero_point.mem.i16;
+    int kPreDivShift = mli_math_norm_fx(scale_in) -
+                       mli_math_norm_fx(scale_out);
+    /* Normalize In/Out Scale ratio and cast to 16bit */
+    int norm_shift;
+    params->scale = mli_math_norm_cast_fx(
+                    ((int32_t)(scale_in) << kPreDivShift) /
+                    scale_out, &norm_shift);
+
+    params->shift = kPreDivShift;
+    params->shift += mli_hlp_tensor_scale_shift(in, 0) - mli_hlp_tensor_scale_shift(out, 0);
+    params->shift -= norm_shift;
+    params->offset = out_zp;
+    int shift_left = mli_math_max_fx(-params->shift, 0);
+    params->scale = mli_math_asl_fx(params->scale, shift_left);
+    params->shift = mli_math_max_fx(params->shift, 0);
+
+
+}
+static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_alpha_params(const mli_tensor *in,
         const mli_tensor *slope_coeff,
         mli_tensor *out,
         const int8_t alpha_sa8,
         const s8asym_quant_params *identity_params) {
 
-    /* ****************************************************************************************************************
+    /* ****************************************************************************************************************
      * Mathematical Derivations out_sa8 Requantization Params with alpha scale to use with in_sa8
      * ----------------------------------------------------------------------------------------------------------------
      * First we need to define Quantization Params for In/Out
      *
@@ -220,8 +271,6 @@ static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_requant_params(con
      * out_sa8 = (in_scale_val/out_scale_val) * alpha_scale * (alpha_sa8 - alpha_zp) * (in_sa8 - in_zp) + out_zp
      *         = scale_val * alpha_val * (alpha_sa8 - alpha_zp) * (in_sa8 - in_zp) + out_zp
      *         = scale_alpha_val * (in_sa8 - in_zp) + out_zp
-     *         = scale_alpha_val * in_sa8 + out_zp - scale_alpha_val * in_zp
-     *         = scale_alpha_val * in_sa8 + scale_alpha_offset;
      *
      * For scale_alpha_val = scale_val * alpha_val * (alpha_sa8 - alpha_zp)
      *                     = scale * 2 ^(-(scale_shift)) * alpha_scale
@@ -244,10 +293,10 @@ static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_requant_params(con
      *                     scale * alpha_scale * (alpha_sa8 - alpha_zp) * 2^(-(alpha_norm_shift))
      * scale_alpha_shift = scale_shift + alpha_scale_frac_bits - alpha_norm_shift - scale_mul_norm_shift
      *
-     * scale_alpha_offset = out_zp - scale_alpha_val * in_zp
-     *                    = out_zp - (scale_alpha * in_zp) * 2^(-(scale_alpha_shift));
+     * scale_alpha_offset = out_zp
      *
      * ***************************************************************************************************************/
+    int16_t out_zp = out->el_params.sa.zero_point.mem.i16;
 
     int32_t alpha_val = mli_prv_convert_sa8_fx16(alpha_sa8,
                         slope_coeff->el_params.sa.zero_point.mem.i16,
@@ -263,19 +312,12 @@ static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_requant_params(con
     int16_t scale_alpha = mli_math_norm_cast_fx(
                           mli_math_mul_fx(identity_params->scale, alpha), &norm_shift);
     scale_alpha_shift -= norm_shift;
-
-    int16_t in_zp = in->el_params.sa.zero_point.mem.i16;
-    int16_t out_zp = out->el_params.sa.zero_point.mem.i16;
-
-    int16_t scale_alpha_offset = mli_math_sub_fx(out_zp,
-                                 mli_math_cast_fx(
-                                 mli_math_mul_fx(scale_alpha, in_zp), scale_alpha_shift));
 
     /* Define Quantization params for (In * alpha / out) ratio */
     s8asym_quant_params alpha_params;
     alpha_params.scale = scale_alpha;
     alpha_params.shift = scale_alpha_shift;
-    alpha_params.offset = scale_alpha_offset;
+    alpha_params.offset = out_zp;
 
     return alpha_params;
 }
@@ -303,22 +345,16 @@ static MLI_FORCE_INLINE mli_status leaky_relu_sa8_run(const mli_tensor *in,
      * Mathematical Derivations for Leaky RELU SA8
      * ----------------------------------------------------------------------------------------------------------------
      * If (in_sa8 >= in_zp)
-     *      out_sa8 = (idendity_scale * in_sa8) * 2^(-(identity_shift)) + identity_offset;
+     *      out_sa8 = (identity_scale * (in_sa8 - in_zp)) * 2^(-(identity_shift)) + identity_offset;
      * else
-     *      out_sa8 = (alpha_scale * in_sa8) * 2^(-(alpha_shift)) + alpha_offset;
+     *      out_sa8 = (alpha_scale * (in_sa8 - in_zp)) * 2^(-(alpha_shift)) + alpha_offset;
      *
-     * check leaky_relu_define_requant_params for more Documentation
+     * check leaky_relu_define_alpha_params for more documentation
      * ***************************************************************************************************************/
     s8asym_quant_params identity_params;
     /* Define Requantization Params for In/Out scale ratio */
-    define_requant_params(in, out, &identity_params);
-    int shift_left = mli_math_max_fx(-identity_params.shift, 0);
-    identity_params.scale = mli_math_asl_fx(identity_params.scale, shift_left);
-    identity_params.shift = mli_math_max_fx(identity_params.shift, 0);
-    s8asym_quant_params alpha_params = leaky_relu_define_requant_params(in, slope_coeff, out, scale, &identity_params);
-    shift_left = mli_math_max_fx(-alpha_params.shift, 0);
-    alpha_params.scale = mli_math_asl_fx(alpha_params.scale, shift_left);
-    alpha_params.shift = mli_math_max_fx(alpha_params.shift, 0);
+    leaky_relu_define_identity_params(in, out, &identity_params);
+    s8asym_quant_params alpha_params = leaky_relu_define_alpha_params(in, slope_coeff, out, scale, &identity_params);
 
     /* Dummy Load to get num_lanes */
     auto input = mli_prv_load_1vec(in_ptr);
diff --git a/lib/src/kernels/transform/impl/mli_krn_leaky_relu_vdsp.h b/lib/src/kernels/transform/impl/mli_krn_leaky_relu_vdsp.h
index 05270ca5..73b0a972 100644
--- a/lib/src/kernels/transform/impl/mli_krn_leaky_relu_vdsp.h
+++ b/lib/src/kernels/transform/impl/mli_krn_leaky_relu_vdsp.h
@@ -43,8 +43,10 @@ static MLI_FORCE_INLINE vNx2short_t calc_leaky_relu(
     pvNx2 sel = init_predicate(input > 0);
     vNx2short_t neg;
     if ( shift > mul_hi_shift) {
+        constexpr int max_shift_right = 15;
+        int shift_right = mli_math_min_fx(shift - mul_hi_shift, max_shift_right);
         neg = mli_math_mul_fx_high(input, scale);
-        neg = mli_math_asr_rnd_fx(neg, shift - mul_hi_shift);
+        neg = mli_math_asr_rnd_fx(neg, shift_right);
     } else {
         vNx2accint_t acc = mli_math_mul_fx(input, scale);
         neg = mli_math_acc_cast_fx(acc, shift);
@@ -110,33 +112,47 @@ static MLI_FORCE_INLINE vNx4char_t calc_leaky_relu(
     /* Load Input */
     vNx4char_t input = mli_prv_load_1vec(vec_in);
     vNx4short_t input_cast = mli_math_cast_fx(input);
+    input_cast = mli_math_sub(input_cast, in_zp);
     pvNx4 select = init_predicate(input >= in_zp);
+    /*
+     * shifting more than 24 is not needed
+     * as the scaled result = ((input - in_offset) * scale) will be limited by 24 bits.
+     */
+    constexpr int max_shift = 24;
+    constexpr int mul_hi_shift = 16;
+
+    int identity_shift = mli_math_min_fx(identity_params->shift, max_shift);
+    identity_shift -= mul_hi_shift;
+    int shift_left = mli_math_max_fx(1 - identity_shift, 0);
+    int shift_right = mli_math_max_fx(identity_shift, 1);
 
-    int identity_shift = identity_params->shift;
-    int identity_offset = (int)identity_params->offset << identity_shift;
+    int16_t identity_offset = identity_params->offset << shift_right;
 #ifdef ROUND_UP
-    identity_offset += (int)(((uint32_t)1 << identity_shift) >> 1);
+    identity_offset += (int16_t)(((uint16_t)1 << shift_right) >> 1);
 #else
 #error Rounding mode not supported
 #endif
-    vNx4int_t input_identity_scale = mli_math_mul_fx(input_cast, identity_params->scale);
-    input_identity_scale = mli_math_add(input_identity_scale, identity_offset);
-    input_identity_scale = mli_math_asr_fx(input_identity_scale, identity_shift);
+    vNx4short_t input_identity_cast = mli_math_asl_fx(input_cast, shift_left);
+    vNx4short_t input_identity_scale = mli_math_mul_fx_high(input_identity_cast, identity_params->scale);
+    input_identity_scale = mli_math_add_fx(input_identity_scale, (vNx4short_t)identity_offset);
+    vNx4char_t output_identity = mli_math_cast_fx(input_identity_scale, shift_right);
 
-    vNx4char_t output_identity = mli_math_cast_fx(input_identity_scale);
+    int alpha_shift = mli_math_min_fx(alpha_params->shift, max_shift);
+    alpha_shift -= mul_hi_shift;
+    shift_left = mli_math_max_fx(1 - alpha_shift, 0);
+    shift_right = mli_math_max_fx(alpha_shift, 1);
 
-    int alpha_shift = alpha_params->shift;
-    int alpha_offset = (int)alpha_params->offset << alpha_shift;
+    int16_t alpha_offset = alpha_params->offset << shift_right;
 #ifdef ROUND_UP
-    alpha_offset += (int)(((uint32_t)1 << alpha_shift) >> 1);
+    alpha_offset += (int16_t)(((uint16_t)1 << shift_right) >> 1);
 #else
 #error Rounding mode not supported
 #endif
-    vNx4int_t input_alpha_scale = mli_math_mul_fx(input_cast, alpha_params->scale);
-    input_alpha_scale = mli_math_add(input_alpha_scale, alpha_offset);
-    input_alpha_scale = mli_math_asr_fx(input_alpha_scale, alpha_shift);
-
-    vNx4char_t output_alpha = mli_math_cast_fx(input_alpha_scale);
+    vNx4short_t input_alpha_cast = mli_math_asl_fx(input_cast, shift_left);
+    vNx4short_t input_alpha_scale = mli_math_mul_fx_high(input_alpha_cast, alpha_params->scale);
+    input_alpha_scale = mli_math_add_fx(input_alpha_scale, (vNx4short_t)alpha_offset);
+
+    vNx4char_t output_alpha = mli_math_cast_fx(input_alpha_scale, shift_right);
 
     return mli_math_select_fx(select, output_identity, output_alpha);
 }
@@ -182,7 +198,7 @@ static MLI_FORCE_INLINE void compute_leaky_relu_sa8_inner_loop(
         vec_out += remaining_part;
     }
 
-#pragma clang loop unroll_count(2)
+#pragma clang loop unroll_count(4)
     for (int pos3 = remaining_part; pos3 < count; pos3 += num_lanes) {
         compute_leaky_relu(vec_in, vec_out, in_zp, identity_params, alpha_params);
         vec_in += num_lanes;
diff --git a/lib/src/kernels/transform/impl/mli_krn_prelu_dsp.h b/lib/src/kernels/transform/impl/mli_krn_prelu_dsp.h
index 414586ea..d99ec1f8 100644
--- a/lib/src/kernels/transform/impl/mli_krn_prelu_dsp.h
+++ b/lib/src/kernels/transform/impl/mli_krn_prelu_dsp.h
@@ -60,15 +60,15 @@ static MLI_FORCE_INLINE void compute_prelu(
     mli_prv_store_1_sample(vec_out, calc_prelu(input, scale, shift));
 }
 
-static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_params(const mli_tensor *in,
+static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_alpha_params(const mli_tensor *in,
         const mli_tensor *slope_coeff,
         mli_tensor *out,
         const v2q15_t alpha_sa8,
         const s8asym_quant_params *identity_params) {
 
-    s8asym_quant_params scale0 = mli::krn::ref::prelu_define_requant_params(in, slope_coeff, out,
+    s8asym_quant_params scale0 = mli::krn::ref::prelu_define_requant_alpha_params(in, slope_coeff, out,
                                  alpha_sa8[0], identity_params);
-    s8asym_quant_params scale1 = mli::krn::ref::prelu_define_requant_params(in, slope_coeff, out,
+    s8asym_quant_params scale1 = mli::krn::ref::prelu_define_requant_alpha_params(in, slope_coeff, out,
                                  alpha_sa8[1], identity_params);
     s8asym_quant_params_v alpha_params;
     alpha_params.scale = mli_prv_init_v(scale0.scale, scale1.scale );
diff --git a/lib/src/kernels/transform/impl/mli_krn_prelu_ref.h b/lib/src/kernels/transform/impl/mli_krn_prelu_ref.h
index 2d4d180f..3d2a0a5d 100644
--- a/lib/src/kernels/transform/impl/mli_krn_prelu_ref.h
+++ b/lib/src/kernels/transform/impl/mli_krn_prelu_ref.h
@@ -210,13 +210,13 @@ static MLI_FORCE_INLINE mli_status prelu_fx_run(const mli_tensor *in,
     return MLI_STATUS_OK;
 }
 
-static MLI_FORCE_INLINE s8asym_quant_params prelu_define_requant_params(const mli_tensor *in,
+static MLI_FORCE_INLINE s8asym_quant_params prelu_define_requant_alpha_params(const mli_tensor *in,
         const mli_tensor *slope_coeff,
         mli_tensor *out,
         const int8_t alpha_sa8,
         const s8asym_quant_params *identity_params) {
 
-    return mli::krn::ref::leaky_relu_define_requant_params(in, slope_coeff, out, alpha_sa8, identity_params);
+    return mli::krn::ref::leaky_relu_define_alpha_params(in, slope_coeff, out, alpha_sa8, identity_params);
 }
 
 static MLI_FORCE_INLINE void compute_prelu_broadcast(
@@ -252,7 +252,7 @@ static MLI_FORCE_INLINE void compute_prelu_broadcast(
         const MLI_PTR(int8_t) vec_in = (MLI_PTR(int8_t))in_prv.ptr + scale_idx * axis_in_mem_stride;
         MLI_OUT_PTR(int8_t) vec_out = out_prv.ptr + scale_idx * axis_out_mem_stride;
         /* Load Scale Elem */
-        auto alpha_params = mli::krn::ref::prelu_define_requant_params(in, slope_coeff, out,
+        auto alpha_params = mli::krn::ref::prelu_define_requant_alpha_params(in, slope_coeff, out,
                             slope_ptr[scale_idx], identity_params);
 
         /* Loop Over Sub Tensor */
@@ -346,14 +346,11 @@ static MLI_FORCE_INLINE mli_status prelu_sa8_run(const mli_tensor *in,
      * else
      *      out_sa8 = (alpha_scale * in_sa8) * 2^(-(alpha_shift)) + alpha_offset;
      *
-     * check prelu_define_requant_params for more Documentation
+     * check prelu_define_requant_alpha_params for more documentation
      * ***************************************************************************************************************/
     s8asym_quant_params identity_params;
     /* Define Requantization Params for In/Out scale ratio */
-    define_requant_params(in, out, &identity_params);
-    int shift_left = mli_math_max_fx(-identity_params.shift, 0);
-    identity_params.scale = mli_math_asl_fx(identity_params.scale, shift_left);
-    identity_params.shift = mli_math_max_fx(identity_params.shift, 0);
+    leaky_relu_define_identity_params(in, out, &identity_params);
 
     /* Input Zero Point */
     int16_t in_zp = in->el_params.sa.zero_point.mem.i16;
@@ -376,7 +373,7 @@ static MLI_FORCE_INLINE mli_status prelu_sa8_run(const mli_tensor *in,
     if (remaining_part) {
         /* Load Scale Vector */
         auto scale_v = mli_prv_load_1vec(slope_ptr);
-        auto alpha_params = mli::krn::prelu_define_requant_params(in, slope_coeff, out, scale_v, &identity_params);
+        auto alpha_params = mli::krn::prelu_define_requant_alpha_params(in, slope_coeff, out, scale_v, &identity_params);
         mli::krn::compute_prelu_no_broadcast(in_ptr, out_ptr, in_zp, &identity_params, &alpha_params,
                                              in_prv, out_prv, remaining_part);
     }
@@ -387,7 +384,7 @@ static MLI_FORCE_INLINE mli_status prelu_sa8_run(const mli_tensor *in,
             vec_out = out_ptr + scale_idx * axis_out_mem_stride;
             /* Load Scale Vector */
             auto scale_v = mli_prv_load_1vec(&slope_ptr[scale_idx]);
-            auto alpha_params = mli::krn::prelu_define_requant_params(in, slope_coeff, out, scale_v, &identity_params);
+            auto alpha_params = mli::krn::prelu_define_requant_alpha_params(in, slope_coeff, out, scale_v, &identity_params);
             mli::krn::compute_prelu_no_broadcast(vec_in, vec_out, in_zp, &identity_params, &alpha_params,
                                                  in_prv, out_prv);
diff --git a/lib/src/kernels/transform/impl/mli_krn_prelu_vdsp.h b/lib/src/kernels/transform/impl/mli_krn_prelu_vdsp.h
index fd8d05db..5f9118f9 100644
--- a/lib/src/kernels/transform/impl/mli_krn_prelu_vdsp.h
+++ b/lib/src/kernels/transform/impl/mli_krn_prelu_vdsp.h
@@ -167,12 +167,12 @@ MLI_FORCE_INLINE void compute_prelu(
     mli_prv_stride_store_n_samples(vec_out, calc_prelu(input, scale, shift), stride_out, remaining_part);
 }
 
-static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_params(const mli_tensor *in,
+static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_alpha_params(const mli_tensor *in,
         const mli_tensor *slope_coeff,
         mli_tensor *out,
         const vNx4char_t alpha_sa8,
         const s8asym_quant_params *identity_params) {
-
+    int16_t out_zp = out->el_params.sa.zero_point.mem.i16;
     vNx4int_t alpha_val = mli_prv_convert_sa8_fx32(alpha_sa8,
                           slope_coeff->el_params.sa.zero_point.mem.i16,
                           slope_coeff->el_params.sa.scale.mem.i16);
@@ -188,29 +188,12 @@ static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_params(const
                           mli_math_mul_fx(identity_params->scale, alpha), &norm_shift);
     scale_alpha_shift -= norm_shift;
 
-    int16_t in_zp = in->el_params.sa.zero_point.mem.i16;
-    int16_t out_zp = out->el_params.sa.zero_point.mem.i16;
-
-    vNx4int_t shift_left = mli_math_max_fx(-scale_alpha_shift, 0);
-    vNx4int_t shift_right = mli_math_max_fx(scale_alpha_shift, 0);
-
-    vNx4int_t scale_zp = mli_math_mul_fx(scale_alpha, in_zp);
-    scale_zp = mli_math_asl_fx(scale_zp, shift_left);
-    scale_zp = mli_math_asr_rnd_fx(scale_zp, shift_right);
-
-    vNx4short_t scale_alpha_offset = mli_math_sub_fx(out_zp,
-                                     mli_math_cast_fx(scale_zp, 0));
-
     /* Define Quantization params for (In * alpha / out) ratio */
     s8asym_quant_params_v alpha_params;
     alpha_params.scale = scale_alpha;
     alpha_params.shift = mli_math_cast_fx(scale_alpha_shift);
-    alpha_params.offset = scale_alpha_offset;
+    alpha_params.offset = (vNx4short_t)out_zp;
 
-    /* Apply Shifting Left over Scale value */
-    vNx4short_t shift_l = mli_math_max_fx(-alpha_params.shift, 0);
-    alpha_params.scale = mli_math_asl_fx(alpha_params.scale, shift_l);
-    alpha_params.shift = mli_math_max_fx(alpha_params.shift, 0);
 
     return alpha_params;
 }
@@ -221,28 +204,43 @@ static MLI_FORCE_INLINE vNx4char_t calc_prelu(
         const s8asym_quant_params_v *alpha_params) {
 
     vNx4short_t input_cast = mli_math_cast_fx(input);
+    input_cast = mli_math_sub(input_cast, in_zp);
     pvNx4 select = init_predicate(input >= in_zp);
+    /*
+     * shifting more than 24 is not needed
+     * as the scaled result = ((input - in_offset) * scale) will be limited by 24 bits.
+     */
+    constexpr int max_shift = 24;
+    constexpr int mul_hi_shift = 16;
 
     int identity_shift = identity_params->shift;
-    int identity_offset = (int)identity_params->offset << identity_shift;
+    identity_shift = mli_math_min_fx(identity_params->shift, max_shift);
+    identity_shift -= mul_hi_shift;
+    int shift_left = mli_math_max_fx(1 - identity_shift, 0);
+    int shift_right = mli_math_max_fx(identity_shift, 1);
+    int16_t identity_offset = identity_params->offset << shift_right;
 #ifdef ROUND_UP
-    identity_offset += (int)(((uint32_t)1 << identity_shift) >> 1);
+    identity_offset += (int)(((uint16_t)1 << shift_right) >> 1);
 #else
 #error Rounding mode not supported
 #endif
-    vNx4int_t input_identity_scale = mli_math_mul_fx(identity_params->scale, input_cast);
-    input_identity_scale = mli_math_add(input_identity_scale, identity_offset);
-    input_identity_scale = mli_math_asr_fx(input_identity_scale, identity_shift);
+    vNx4short_t input_cast1 = mli_math_asl_fx(input_cast, shift_left);
+    vNx4short_t input_identity_scale = mli_math_mul_fx_high(identity_params->scale, input_cast1);
+    input_identity_scale = mli_math_add_fx(input_identity_scale, (vNx4short_t)identity_offset);
+
+    vNx4char_t output_identity = mli_math_cast_fx(input_identity_scale, shift_right);
 
-    vNx4char_t output_identity = mli_math_cast_fx(input_identity_scale);
+    vNx4short_t alpha_shift = mli_math_min_fx(alpha_params->shift, max_shift);
+    alpha_shift -= mul_hi_shift;
+    vNx4short_t shift_left1 = mli_math_max_fx(1 - alpha_shift, 0);
+    vNx4short_t shift_right1 = mli_math_max_fx(alpha_shift, 1);
 
-    vNx4int_t alpha_shift = mli_math_cast_fx(alpha_params->shift);
-    vNx4int_t input_alpha_scale = mli_math_mul_fx(alpha_params->scale, input_cast);
-    input_alpha_scale = mli_math_asr_rnd_fx(input_alpha_scale, alpha_shift);
-    input_alpha_scale = mli_math_add(input_alpha_scale,
-                        mli_math_cast_fx(alpha_params->offset));
+    vNx4short_t input_cast2 = mli_math_asl_fx(input_cast, shift_left1);
+    vNx4short_t input_alpha_scale = mli_math_mul_fx_high(input_cast2, alpha_params->scale);
+    input_alpha_scale = mli_math_asr_rnd_fx(input_alpha_scale, shift_right1);
+    input_alpha_scale = mli_math_add_fx(input_alpha_scale, alpha_params->offset);
 
-    vNx4char_t output_alpha = mli_math_cast_fx(input_alpha_scale);
+    vNx4char_t output_alpha = mli_math_cast_fx(input_alpha_scale);
 
     return mli_math_select_fx(select, output_identity, output_alpha);
 }
@@ -444,7 +442,7 @@ static MLI_FORCE_INLINE void compute_prelu_broadcast(
             MLI_OUT_PTR(int8_t) vec_out = out_prv.ptr + scale_idx * axis_out_mem_stride;
             /* Load Scale Vector */
             auto scale_v = mli_prv_load_1vec(slope_ptr);
-            auto alpha_params = mli::krn::prelu_define_requant_params(in, slope_coeff, out, scale_v, identity_params);
+            auto alpha_params = mli::krn::prelu_define_requant_alpha_params(in, slope_coeff, out, scale_v, identity_params);
             /* Loop Over Sub Tensor */
             const MLI_PTR(int8_t) orig_vec_in = vec_in;
             MLI_OUT_PTR(int8_t) orig_vec_out = vec_out;
@@ -468,7 +466,7 @@ static MLI_FORCE_INLINE void compute_prelu_broadcast(
         const MLI_PTR(int8_t) vec_in = (MLI_PTR(int8_t))in_prv.ptr + scale_idx * axis_in_mem_stride;
         MLI_OUT_PTR(int8_t) vec_out = out_prv.ptr + scale_idx * axis_out_mem_stride;
         /* Load Scale Elem */
-        auto alpha_params = mli::krn::ref::prelu_define_requant_params(in, slope_coeff, out,
+        auto alpha_params = mli::krn::ref::prelu_define_requant_alpha_params(in, slope_coeff, out,
                             slope_ptr[scale_idx], identity_params);
 
         /* Loop Over Sub Tensor */
diff --git a/lib/src/kernels/transform/mli_krn_prelu.h b/lib/src/kernels/transform/mli_krn_prelu.h
index 6ad6277e..4da41483 100644
--- a/lib/src/kernels/transform/mli_krn_prelu.h
+++ b/lib/src/kernels/transform/mli_krn_prelu.h
@@ -24,7 +24,7 @@ namespace mli {
 namespace krn {
 #if !defined(MLI_BUILD_REFERENCE) && defined(__Xvec_width)
 using mli::krn::vdsp::compute_prelu;
-using mli::krn::vdsp::prelu_define_requant_params;
+using mli::krn::vdsp::prelu_define_requant_alpha_params;
 using mli::krn::vdsp::compute_prelu_no_broadcast;
 using mli::krn::vdsp::compute_prelu_broadcast;
 using mli::krn::ref::prelu_fx_run;
@@ -32,7 +32,7 @@ using mli::krn::ref::prelu_sa8_run;
 
 #elif !defined(MLI_BUILD_REFERENCE) && defined(__FXAPI__)
 using mli::krn::dsp::compute_prelu;
-using mli::krn::dsp::prelu_define_requant_params;
+using mli::krn::dsp::prelu_define_requant_alpha_params;
 using mli::krn::dsp::compute_prelu_no_broadcast;
 using mli::krn::ref::compute_prelu_broadcast;
 using mli::krn::ref::prelu_fx_run;
@@ -40,7 +40,7 @@ using mli::krn::ref::prelu_sa8_run;
 
 #else
 using mli::krn::ref::compute_prelu;
-using mli::krn::ref::prelu_define_requant_params;
+using mli::krn::ref::prelu_define_requant_alpha_params;
 using mli::krn::ref::compute_prelu_no_broadcast;
 using mli::krn::ref::compute_prelu_broadcast;
 using mli::krn::ref::prelu_fx_run;
diff --git a/lib/src/kernels/transform/mli_krn_prelu_decl.h b/lib/src/kernels/transform/mli_krn_prelu_decl.h
index a540077f..abc0fb04 100644
--- a/lib/src/kernels/transform/mli_krn_prelu_decl.h
+++ b/lib/src/kernels/transform/mli_krn_prelu_decl.h
@@ -104,7 +104,7 @@ static MLI_FORCE_INLINE void compute_prelu_no_broadcast(
         const generic_tensor_private_t out_prv,
         const int remaining_part = 0);
 
-static MLI_FORCE_INLINE s8asym_quant_params prelu_define_requant_params(const mli_tensor *in,
+static MLI_FORCE_INLINE s8asym_quant_params prelu_define_requant_alpha_params(const mli_tensor *in,
         const mli_tensor *slope_coeff,
         mli_tensor *out,
         const int8_t alpha_sa8,
@@ -144,7 +144,7 @@ static MLI_FORCE_INLINE void compute_prelu(
         const int remaining_part);
 
 #if !defined(MLI_BUILD_REFERENCE) && defined(__FXAPI__)
-static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_params(const mli_tensor *in,
+static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_alpha_params(const mli_tensor *in,
         const mli_tensor *slope_coeff,
         mli_tensor *out,
         const v2q15_t alpha_sa8,
@@ -239,7 +239,7 @@ MLI_FORCE_INLINE void compute_prelu(
         const int shift,
         const int remaining_part);
 
-static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_params(const mli_tensor *in,
+static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_alpha_params(const mli_tensor *in,
         const mli_tensor *slope_coeff,
         mli_tensor *out,
         vNx4char_t alpha_sa8,
diff --git a/user_tests/tests/mli_krn_leaky_relu/tests_mli_krn_leaky_relu.cc b/user_tests/tests/mli_krn_leaky_relu/tests_mli_krn_leaky_relu.cc
index a593e9e1..4de811db 100644
--- a/user_tests/tests/mli_krn_leaky_relu/tests_mli_krn_leaky_relu.cc
+++ b/user_tests/tests/mli_krn_leaky_relu/tests_mli_krn_leaky_relu.cc
@@ -47,9 +47,9 @@ struct leaky_relu_test_operands {
 
 #if defined(CRC_RM_CONVERGENT) || defined(CRC_RM_UP)
 // Shared CRC Results
-const crc32_calc test_1_chksum_fx16{ 0x7695FBF8 }, test_1_chksum_sa8{ 0x97AACEC4 },
-                 test_2_chksum_fx16{ 0x0E245804 }, test_2_chksum_sa8{ 0x6A2A3EB5 },
-                 test_3_chksum_fx16{ 0xB4A6991D }, test_3_chksum_sa8{ 0x6FEF833F };
+const crc32_calc test_1_chksum_fx16{ 0x7695FBF8 }, test_1_chksum_sa8{ 0x8C55B2DF },
+                 test_2_chksum_fx16{ 0x0E245804 }, test_2_chksum_sa8{ 0x6A3A8FB7 },
+                 test_3_chksum_fx16{ 0xB4A6991D }, test_3_chksum_sa8{ 0xEE1C5F07 };
 #else
 // Not defined CRC_*
 const crc32_calc test_1_chksum_fx16, test_1_chksum_sa8,
diff --git a/user_tests/tests/mli_krn_prelu/tests_mli_krn_prelu.cc b/user_tests/tests/mli_krn_prelu/tests_mli_krn_prelu.cc
index d77f7d8b..46530bea 100644
--- a/user_tests/tests/mli_krn_prelu/tests_mli_krn_prelu.cc
+++ b/user_tests/tests/mli_krn_prelu/tests_mli_krn_prelu.cc
@@ -50,13 +50,13 @@ struct prelu_test_operands {
 
 #if defined(CRC_RM_CONVERGENT) || defined(CRC_RM_UP)
 // Shared CRC Results
-const crc32_calc test_1_chksum_fx16{ 0x92934920 }, test_1_chksum_sa8{ 0x21038039 },
-                 test_2_chksum_fx16{ 0x0C8AFCA5 }, test_2_chksum_sa8{ 0x5ACCCB5A },
-                 test_3_chksum_fx16{ 0xD2F8214F }, test_3_chksum_sa8{ 0x464AE450 },
-                 test_4_chksum_fx16{ 0xBBED4B5D }, test_4_chksum_sa8{ 0xE24BFAED },
-                 test_5_chksum_fx16{ 0x6BAA528A }, test_5_chksum_sa8{ 0x2EFD80F1 },
-                 test_6_chksum_fx16{ 0xC5FBEE22 }, test_6_chksum_sa8{ 0x91229CCF },
-                 test_7_chksum_fx16{ 0x0C3944AD }, test_7_chksum_sa8{ 0x61E36677 };
+const crc32_calc test_1_chksum_fx16{ 0x92934920 }, test_1_chksum_sa8{ 0x50985847 },
+                 test_2_chksum_fx16{ 0x0C8AFCA5 }, test_2_chksum_sa8{ 0x6571F4D0 },
+                 test_3_chksum_fx16{ 0xD2F8214F }, test_3_chksum_sa8{ 0xDDE5DCC2 },
+                 test_4_chksum_fx16{ 0xBBED4B5D }, test_4_chksum_sa8{ 0x065E94FC },
+                 test_5_chksum_fx16{ 0x6BAA528A }, test_5_chksum_sa8{ 0x152D4871 },
+                 test_6_chksum_fx16{ 0xC5FBEE22 }, test_6_chksum_sa8{ 0xB9BE7962 },
+                 test_7_chksum_fx16{ 0x0C3944AD }, test_7_chksum_sa8{ 0xAAEDDE11 };
 #else
 // Not defined CRC_*
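
A scalar model can help when checking the requantization math above by hand. The sketch below is illustrative only and is not part of the patch: requant_sa8 is a hypothetical plain-C helper standing in for the MLI fixed-point intrinsics. It applies out_sa8 = (scale * (in_sa8 - in_zp)) * 2^(-shift) + out_zp with ROUND_UP rounding and int8 saturation, i.e. the scale/shift/offset form produced by leaky_relu_define_identity_params and leaky_relu_define_alpha_params (offset = out_zp, shift >= 0 after the left-shift fold).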
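#include <stdint.h>
#include <stdio.h>

/* Hypothetical scalar model of the sa8 requantization used by this patch:
 * out = (scale * (in - in_zp)) * 2^(-shift) + out_zp, ROUND_UP rounding. */
static int8_t requant_sa8(int8_t in, int16_t in_zp,
                          int16_t scale, int shift, int16_t out_zp) {
    int32_t acc = (int32_t)(in - in_zp) * scale;  /* (in_sa8 - in_zp) * scale */
    if (shift > 0) {
        acc += (int32_t)1 << (shift - 1);         /* ROUND_UP rounding bias   */
        acc >>= shift;                            /* * 2^(-shift)             */
    }
    acc += out_zp;                                /* offset = out_zp          */
    if (acc > INT8_MAX) acc = INT8_MAX;           /* saturate to sa8 range    */
    if (acc < INT8_MIN) acc = INT8_MIN;
    return (int8_t)acc;
}

int main(void) {
    /* example: in/out scale ratio of 0.5 encoded as scale = 16384, shift = 15 */
    printf("%d\n", requant_sa8(100, 0, 16384, 15, 0));  /* prints 50 */
    return 0;
}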
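The vdsp kernels compute the same expression with a 16x16 high-half multiply (mli_math_mul_fx_high), which already consumes 16 bits of the shift. The following sketch is a hypothetical scalar model of that path as calc_leaky_relu/calc_prelu organize it: the total shift is clamped to 24, the remainder beyond the multiply's 16 bits becomes shift_right (at least 1), shift_left pre-scales the input when the total shift is below 17, and the output zero point plus the ROUND_UP bias are pre-shifted into a single 16-bit offset. It assumes a total shift roughly in the 11..24 range so the pre-shifted difference still fits in 16 bits.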
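#include <stdint.h>
#include <stdio.h>

/* high 16 bits of a 16x16 signed product, as mul_fx_high does per lane */
static int16_t mul_hi16(int16_t a, int16_t b) {
    return (int16_t)(((int32_t)a * b) >> 16);
}

/* Hypothetical scalar model of the vdsp mul_hi requantization path. */
static int8_t requant_sa8_mul_hi(int8_t in, int16_t in_zp, int16_t scale,
                                 int total_shift, int16_t out_zp) {
    int shift = (total_shift < 24 ? total_shift : 24) - 16; /* part not covered by mul_hi */
    int shift_left = (shift < 1) ? (1 - shift) : 0;
    int shift_right = (shift < 1) ? 1 : shift;
    int16_t offset = (int16_t)(out_zp << shift_right);       /* pre-shifted zero point  */
    offset += (int16_t)(((uint16_t)1 << shift_right) >> 1);  /* ROUND_UP rounding bias  */
    int16_t x = (int16_t)((in - in_zp) << shift_left);       /* pre-scale the input     */
    int16_t acc = (int16_t)(mul_hi16(x, scale) + offset);    /* stays in 16-bit "lane"  */
    int32_t res = acc >> shift_right;                        /* final cast with shift   */
    if (res > INT8_MAX) res = INT8_MAX;                      /* saturate to sa8 range   */
    if (res < INT8_MIN) res = INT8_MIN;
    return (int8_t)res;
}

int main(void) {
    /* same example as the plain model: scale = 16384, total shift = 15 -> 50 */
    printf("%d\n", requant_sa8_mul_hi(100, 0, 16384, 15, 0));
    return 0;
}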
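The design choice the models illustrate: folding 16 bits of the shift into the high-half multiply, and baking zero point and rounding into one 16-bit constant, keeps the whole sa8 path in 16-bit vector lanes instead of widening to 32-bit accumulators, which is what the switch from mli_math_mul_fx to mli_math_mul_fx_high buys on the vector DSP.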