Skip to content

Commit

Permalink
use mlu_hi for leaky/prelu sa8
Browse files Browse the repository at this point in the history
  • Loading branch information
Hakim7267 authored and JaccovG committed Jun 29, 2021
1 parent f3bc362 commit 18ffe4c
Show file tree
Hide file tree
Showing 9 changed files with 162 additions and 115 deletions.
106 changes: 71 additions & 35 deletions lib/src/kernels/transform/impl/mli_krn_leaky_relu_ref.h
Expand Up @@ -62,17 +62,17 @@ static MLI_FORCE_INLINE void compute_leaky_relu(
int8_t input = vec_in[0];
int32_t output;
if (input >= in_zp) {
/* out_sa8 = (identity_scale * in_sa8) * 2^(-(identity_shift)) + identity_offset */
output = mli_math_add_fx<int32_t>(
mli_math_asr_rnd_fx(
mli_math_mul_fx<int16_t, int32_t>(identity_params->scale, input),
identity_params->shift), identity_params->offset);
/* out_sa8 = (identity_scale * (in_sa8 - in_zp)) * 2^(-(identity_shift)) + identity_offset */
int16_t input_sub = mli_math_sub_fx((int16_t)input, in_zp);
output = mli_math_asr_rnd_fx(
mli_math_mul_fx<int16_t, int32_t>(identity_params->scale, input_sub), identity_params->shift);
output = mli_math_add_fx(output, (int32_t)identity_params->offset);
} else {
/* out_sa8 = (alpha_scale * in_sa8) * 2^(-(alpha_shift)) + alpha_offset */
output = mli_math_add_fx<int32_t>(
mli_math_asr_rnd_fx(
mli_math_mul_fx<int16_t, int32_t>(alpha_params->scale, input),
alpha_params->shift), alpha_params->offset);
/* out_sa8 = (alpha_scale * (in_sa8 - in_zp)) * 2^(-(alpha_shift)) + alpha_offset */
int16_t input_sub = mli_math_sub_fx((int16_t)input, in_zp);
output = mli_math_asr_rnd_fx(
mli_math_mul_fx<int16_t, int32_t>(alpha_params->scale, input_sub), alpha_params->shift);
output = mli_math_add_fx(output, (int32_t)alpha_params->offset);
}

vec_out[0] = mli_math_cast_fx<int32_t, int8_t>(output, 0);
Expand Down Expand Up @@ -201,13 +201,64 @@ static MLI_FORCE_INLINE mli_status leaky_relu_fx_run(const mli_tensor *in,
return MLI_STATUS_OK;
}

static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_requant_params(const mli_tensor *in,
static MLI_FORCE_INLINE void leaky_relu_define_identity_params(const mli_tensor *in, const mli_tensor *out,
s8asym_quant_params *params) {

/* ****************************************************************************************************************
 * Derivation of the out_sa8 requantization params (scale / shift / offset) to apply to in_sa8
 * ----------------------------------------------------------------------------------------------------------------
 * out_sa8 = (in_scale_val/out_scale_val) *(in_sa8 - in_zp) + out_zp
 * = scale_val * (in_sa8 - in_zp) + out_zp
 * where:
 *
 * scale_val = in_scale_val / out_scale_val;
 * = in_scale * 2^(-in_scale_frac_bits) / (out_scale * 2^(-out_scale_frac_bits));
 * = (in_scale_val * 2^kPreDivShift / out_scale_val)
 * * 2^(-(kPreDivShift + in_scale_frac_bits - out_scale_frac_bits));
 * = (in_scale_val * 2^kPreDivShift / out_scale_val) * 2^(-norm_shift)
 * * 2^(-(kPreDivShift + in_scale_frac_bits - out_scale_frac_bits - norm_shift));
 * = (in_scale_val * 2^kPreDivShift / out_scale_val) * 2^(-norm_shift)
 * * 2^(-scale_shift)
 * = scale * 2 ^(-(scale_shift))
 *
 * where scale = (in_scale_val * 2^kPreDivShift / out_scale_val) * 2^(-norm_shift)
 * scale_shift = kPreDivShift + in_scale_frac_bits - out_scale_frac_bits - norm_shift
 * norm_shift is the shift value due to normalizing the result of
 * (in_scale_val * 2^kPreDivShift / out_scale_val) and casting it from int32_t to int16_t
 * kPreDivShift is derived from norm_32(in_scale) - norm_16(out_scale)
 *
 * offset = out_zp
 *
 * ***************************************************************************************************************/

/* Per-tensor (dim 0) quantization scales of input and output. */
int16_t scale_in = mli_hlp_tensor_scale(in, 0);
int16_t scale_out = mli_hlp_tensor_scale(out, 0);
/* Output zero point becomes the requantization offset (see derivation above). */
int16_t out_zp = out->el_params.sa.zero_point.mem.i16;
/* Pre-division left shift that maximizes precision of the integer ratio scale_in/scale_out:
 * headroom of scale_in in 32 bits minus headroom of scale_out in 16 bits. */
int kPreDivShift = mli_math_norm_fx<int32_t, int32_t>(scale_in) -
mli_math_norm_fx<int16_t, int32_t>(scale_out);
/* Normalize In/Out Scale ratio and cast to 16bit */
int norm_shift;
params->scale = mli_math_norm_cast_fx<int32_t, int16_t>(
((int32_t)(scale_in) << kPreDivShift) /
scale_out, &norm_shift);

/* Total right shift: pre-div shift, plus the difference of the tensors'
 * fractional bits, minus the normalization applied during the 16-bit cast. */
params->shift = kPreDivShift;
params->shift += mli_hlp_tensor_scale_shift(in, 0) - mli_hlp_tensor_scale_shift(out, 0);
params->shift -= norm_shift;
params->offset = out_zp;
/* If the net shift came out negative, fold it into the scale as a left shift
 * so that params->shift is always a non-negative right shift for the kernels. */
int shift_left = mli_math_max_fx(-params->shift, 0);
params->scale = mli_math_asl_fx(params->scale, shift_left);
params->shift = mli_math_max_fx(params->shift, 0);


}
static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_alpha_params(const mli_tensor *in,
const mli_tensor *slope_coeff,
mli_tensor *out,
const int8_t alpha_sa8,
const s8asym_quant_params *identity_params) {

/* ****************************************************************************************************************
/* ****************************************************************************************************************
* Mathematical Derivations out_sa8 Requantization Params with alpha scale to use with in_sa8
* ----------------------------------------------------------------------------------------------------------------
* First we need to define Quantization Params for In/Out
Expand All @@ -220,8 +271,6 @@ static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_requant_params(con
* out_sa8 = (in_scale_val/out_scale_val) * alpha_scale * (alpha_sa8 - alpha_zp) * (in_sa8 - in_zp) + out_zp
* = scale_val * alpha_val * (alpha_sa8 - alpha_zp) * (in_sa8 - in_zp) + out_zp
* = scale_alpha_val * (in_sa8 - in_zp) + out_zp
* = scale_alpha_val * in_sa8 + out_zp - scale_alpha_val * in_zp
* = scale_alpha_val * in_sa8 + scale_alpha_offset;
*
* For scale_alpha_val = scale_val * alpha_val * (alpha_sa8 - alpha_zp)
* = scale * 2 ^(-(scale_shift)) * alpha_scale
Expand All @@ -244,10 +293,10 @@ static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_requant_params(con
* scale * alpha_scale * (alpha_sa8 - alpha_zp) * 2^(-(alpha_norm_shift))
* scale_alpha_shift = scale_shift + alpha_scale_frac_bits - alpha_norm_shift - scale_mul_norm_shift
*
* scale_alpha_offset = out_zp - scale_alpha_val * in_zp
* = out_zp - (scale_alpha * in_zp) * 2^(-(scale_alpha_shift));
* scale_alpha_offset = out_zp
*
* ***************************************************************************************************************/
int16_t out_zp = out->el_params.sa.zero_point.mem.i16;

int32_t alpha_val = mli_prv_convert_sa8_fx16<int8_t, int32_t>(alpha_sa8,
slope_coeff->el_params.sa.zero_point.mem.i16,
Expand All @@ -263,19 +312,12 @@ static MLI_FORCE_INLINE s8asym_quant_params leaky_relu_define_requant_params(con
int16_t scale_alpha = mli_math_norm_cast_fx<int32_t,int16_t>(
mli_math_mul_fx<int16_t, int32_t>(identity_params->scale, alpha), &norm_shift);
scale_alpha_shift -= norm_shift;

int16_t in_zp = in->el_params.sa.zero_point.mem.i16;
int16_t out_zp = out->el_params.sa.zero_point.mem.i16;

int16_t scale_alpha_offset = mli_math_sub_fx<int16_t>(out_zp,
mli_math_cast_fx<int32_t, int16_t>(
mli_math_mul_fx<int16_t, int32_t>(scale_alpha, in_zp), scale_alpha_shift));

/* Define Quantization params for (In * alpha / out) ratio */
s8asym_quant_params alpha_params;
alpha_params.scale = scale_alpha;
alpha_params.shift = scale_alpha_shift;
alpha_params.offset = scale_alpha_offset;
alpha_params.offset = out_zp;
return alpha_params;
}

Expand Down Expand Up @@ -303,22 +345,16 @@ static MLI_FORCE_INLINE mli_status leaky_relu_sa8_run(const mli_tensor *in,
* Mathematical Derivations for Leaky RELU SA8
* ----------------------------------------------------------------------------------------------------------------
* If (in_sa8 >= in_zp)
* out_sa8 = (identity_scale * in_sa8) * 2^(-(identity_shift)) + identity_offset;
* out_sa8 = (identity_scale * (in_sa8 - in_zp)) * 2^(-(identity_shift)) + identity_offset;
* else
* out_sa8 = (alpha_scale * in_sa8) * 2^(-(alpha_shift)) + alpha_offset;
* out_sa8 = (alpha_scale * (in_sa8 - in_zp)) * 2^(-(alpha_shift)) + alpha_offset;
*
* check leaky_relu_define_requant_params for more Documentation
* check leaky_relu_define_alpha_params for more Documentation
* ***************************************************************************************************************/
s8asym_quant_params identity_params;
/* Define Requantization Params for In/Out scale ratio */
define_requant_params(in, out, &identity_params);
int shift_left = mli_math_max_fx(-identity_params.shift, 0);
identity_params.scale = mli_math_asl_fx(identity_params.scale, shift_left);
identity_params.shift = mli_math_max_fx(identity_params.shift, 0);
s8asym_quant_params alpha_params = leaky_relu_define_requant_params(in, slope_coeff, out, scale, &identity_params);
shift_left = mli_math_max_fx(-alpha_params.shift, 0);
alpha_params.scale = mli_math_asl_fx(alpha_params.scale, shift_left);
alpha_params.shift = mli_math_max_fx(alpha_params.shift, 0);
leaky_relu_define_identity_params(in, out, &identity_params);
s8asym_quant_params alpha_params = leaky_relu_define_alpha_params(in, slope_coeff, out, scale, &identity_params);

/* Dummy Load to get num_lanes */
auto input = mli_prv_load_1vec(in_ptr);
Expand Down
50 changes: 33 additions & 17 deletions lib/src/kernels/transform/impl/mli_krn_leaky_relu_vdsp.h
Expand Up @@ -43,8 +43,10 @@ static MLI_FORCE_INLINE vNx2short_t calc_leaky_relu(
pvNx2 sel = init_predicate(input > 0);
vNx2short_t neg;
if ( shift > mul_hi_shift) {
constexpr int max_shift_right = 15;
int shift_right = mli_math_min_fx(shift - mul_hi_shift, max_shift_right);
neg = mli_math_mul_fx_high(input, scale);
neg = mli_math_asr_rnd_fx(neg, shift - mul_hi_shift);
neg = mli_math_asr_rnd_fx(neg, shift_right);
} else {
vNx2accint_t acc = mli_math_mul_fx<vNx2short_t, vNx2accint_t>(input, scale);
neg = mli_math_acc_cast_fx<vNx2short_t, vNx2accint_t>(acc, shift);
Expand Down Expand Up @@ -110,33 +112,47 @@ static MLI_FORCE_INLINE vNx4char_t calc_leaky_relu(
/* Load Input */
vNx4char_t input = mli_prv_load_1vec(vec_in);
vNx4short_t input_cast = mli_math_cast_fx<vNx4char_t, vNx4short_t>(input);
input_cast = mli_math_sub(input_cast, in_zp);
pvNx4 select = init_predicate(input >= in_zp);
/*
* shifting more than 24 is not needed
* as the scaled result = ((input - in_offset) * scale) will be limited by 24 bits.
*/
constexpr int max_shift = 24;
constexpr int mul_hi_shift = 16;

int identity_shift = mli_math_min_fx(identity_params->shift, max_shift);
identity_shift -= mul_hi_shift;
int shift_left = mli_math_max_fx(1 - identity_shift, 0);
int shift_right = mli_math_max_fx(identity_shift, 1);

int identity_shift = identity_params->shift;
int identity_offset = (int)identity_params->offset << identity_shift;
int16_t identity_offset = identity_params->offset << shift_right;
#ifdef ROUND_UP
identity_offset += (int)(((uint32_t)1 << identity_shift) >> 1);
identity_offset += (int16_t)(((uint16_t)1 << shift_right) >> 1);
#else
#error Rounding mode not supported
#endif
vNx4int_t input_identity_scale = mli_math_mul_fx<vNx4short_t, vNx4int_t>(input_cast, identity_params->scale);
input_identity_scale = mli_math_add(input_identity_scale, identity_offset);
input_identity_scale = mli_math_asr_fx(input_identity_scale, identity_shift);
vNx4short_t input_identity_cast = mli_math_asl_fx(input_cast, shift_left);
vNx4short_t input_identity_scale = mli_math_mul_fx_high(input_identity_cast, identity_params->scale);
input_identity_scale = mli_math_add_fx(input_identity_scale, (vNx4short_t)identity_offset);
vNx4char_t output_identity = mli_math_cast_fx<vNx4short_t, vNx4char_t, false>(input_identity_scale, shift_right);

vNx4char_t output_identity = mli_math_cast_fx<vNx4int_t, vNx4char_t>(input_identity_scale);
int alpha_shift = mli_math_min_fx(alpha_params->shift, max_shift);
alpha_shift -= mul_hi_shift;
shift_left = mli_math_max_fx(1 - alpha_shift, 0);
shift_right = mli_math_max_fx(alpha_shift, 1);

int alpha_shift = alpha_params->shift;
int alpha_offset = (int)alpha_params->offset << alpha_shift;
int16_t alpha_offset = alpha_params->offset << shift_right;
#ifdef ROUND_UP
alpha_offset += (int)(((uint32_t)1 << alpha_shift) >> 1);
alpha_offset += (int16_t)(((uint16_t)1 << shift_right) >> 1);
#else
#error Rounding mode not supported
#endif
vNx4int_t input_alpha_scale = mli_math_mul_fx<vNx4short_t, vNx4int_t>(input_cast, alpha_params->scale);
input_alpha_scale = mli_math_add(input_alpha_scale, alpha_offset);
input_alpha_scale = mli_math_asr_fx(input_alpha_scale, alpha_shift);
vNx4char_t output_alpha = mli_math_cast_fx<vNx4int_t, vNx4char_t>(input_alpha_scale);
vNx4short_t input_alpha_cast = mli_math_asl_fx(input_cast, shift_left);
vNx4short_t input_alpha_scale = mli_math_mul_fx_high(input_alpha_cast, alpha_params->scale);
input_alpha_scale = mli_math_add_fx(input_alpha_scale, (vNx4short_t)alpha_offset);

vNx4char_t output_alpha = mli_math_cast_fx<vNx4short_t, vNx4char_t, false>(input_alpha_scale, shift_right);
return mli_math_select_fx(select, output_identity, output_alpha);
}

Expand Down Expand Up @@ -182,7 +198,7 @@ static MLI_FORCE_INLINE void compute_leaky_relu_sa8_inner_loop(
vec_out += remaining_part;
}

#pragma clang loop unroll_count(2)
#pragma clang loop unroll_count(4)
for (int pos3 = remaining_part; pos3 < count; pos3 += num_lanes) {
compute_leaky_relu(vec_in, vec_out, in_zp, identity_params, alpha_params);
vec_in += num_lanes;
Expand Down
6 changes: 3 additions & 3 deletions lib/src/kernels/transform/impl/mli_krn_prelu_dsp.h
Expand Up @@ -60,15 +60,15 @@ static MLI_FORCE_INLINE void compute_prelu(
mli_prv_store_1_sample(vec_out, calc_prelu(input, scale, shift));
}

static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_params(const mli_tensor *in,
static MLI_FORCE_INLINE s8asym_quant_params_v prelu_define_requant_alpha_params(const mli_tensor *in,
const mli_tensor *slope_coeff,
mli_tensor *out,
const v2q15_t alpha_sa8,
const s8asym_quant_params *identity_params) {

s8asym_quant_params scale0 = mli::krn::ref::prelu_define_requant_params(in, slope_coeff, out,
s8asym_quant_params scale0 = mli::krn::ref::prelu_define_requant_alpha_params(in, slope_coeff, out,
alpha_sa8[0], identity_params);
s8asym_quant_params scale1 = mli::krn::ref::prelu_define_requant_params(in, slope_coeff, out,
s8asym_quant_params scale1 = mli::krn::ref::prelu_define_requant_alpha_params(in, slope_coeff, out,
alpha_sa8[1], identity_params);
s8asym_quant_params_v alpha_params;
alpha_params.scale = mli_prv_init_v(scale0.scale, scale1.scale );
Expand Down
17 changes: 7 additions & 10 deletions lib/src/kernels/transform/impl/mli_krn_prelu_ref.h
Expand Up @@ -210,13 +210,13 @@ static MLI_FORCE_INLINE mli_status prelu_fx_run(const mli_tensor *in,
return MLI_STATUS_OK;
}

static MLI_FORCE_INLINE s8asym_quant_params prelu_define_requant_params(const mli_tensor *in,
static MLI_FORCE_INLINE s8asym_quant_params prelu_define_requant_alpha_params(const mli_tensor *in,
const mli_tensor *slope_coeff,
mli_tensor *out,
const int8_t alpha_sa8,
const s8asym_quant_params *identity_params) {

return mli::krn::ref::leaky_relu_define_requant_params(in, slope_coeff, out, alpha_sa8, identity_params);
return mli::krn::ref::leaky_relu_define_alpha_params(in, slope_coeff, out, alpha_sa8, identity_params);
}

static MLI_FORCE_INLINE void compute_prelu_broadcast(
Expand Down Expand Up @@ -252,7 +252,7 @@ static MLI_FORCE_INLINE void compute_prelu_broadcast(
const MLI_PTR(int8_t) vec_in = (MLI_PTR(int8_t))in_prv.ptr + scale_idx * axis_in_mem_stride;
MLI_OUT_PTR(int8_t) vec_out = out_prv.ptr + scale_idx * axis_out_mem_stride;
/* Load Scale Elem */
auto alpha_params = mli::krn::ref::prelu_define_requant_params(in, slope_coeff, out,
auto alpha_params = mli::krn::ref::prelu_define_requant_alpha_params(in, slope_coeff, out,
slope_ptr[scale_idx], identity_params);

/* Loop Over Sub Tensor */
Expand Down Expand Up @@ -346,14 +346,11 @@ static MLI_FORCE_INLINE mli_status prelu_sa8_run(const mli_tensor *in,
* else
* out_sa8 = (alpha_scale * in_sa8) * 2^(-(alpha_shift)) + alpha_offset;
*
* check prelu_define_requant_params for more Documentation
* check prelu_define_requant_alpha_params for more Documentation
* ***************************************************************************************************************/
s8asym_quant_params identity_params;
/* Define Requantization Params for In/Out scale ratio */
define_requant_params(in, out, &identity_params);
int shift_left = mli_math_max_fx(-identity_params.shift, 0);
identity_params.scale = mli_math_asl_fx(identity_params.scale, shift_left);
identity_params.shift = mli_math_max_fx(identity_params.shift, 0);
leaky_relu_define_identity_params(in, out, &identity_params);

/* Input Zero Point */
int16_t in_zp = in->el_params.sa.zero_point.mem.i16;
Expand All @@ -376,7 +373,7 @@ static MLI_FORCE_INLINE mli_status prelu_sa8_run(const mli_tensor *in,
if (remaining_part) {
/* Load Scale Vector */
auto scale_v = mli_prv_load_1vec(slope_ptr);
auto alpha_params = mli::krn::prelu_define_requant_params(in, slope_coeff, out, scale_v, &identity_params);
auto alpha_params = mli::krn::prelu_define_requant_alpha_params(in, slope_coeff, out, scale_v, &identity_params);

mli::krn::compute_prelu_no_broadcast(in_ptr, out_ptr, in_zp, &identity_params, &alpha_params,
in_prv, out_prv, remaining_part);
Expand All @@ -387,7 +384,7 @@ static MLI_FORCE_INLINE mli_status prelu_sa8_run(const mli_tensor *in,
vec_out = out_ptr + scale_idx * axis_out_mem_stride;
/* Load Scale Vector */
auto scale_v = mli_prv_load_1vec(&slope_ptr[scale_idx]);
auto alpha_params = mli::krn::prelu_define_requant_params(in, slope_coeff, out, scale_v, &identity_params);
auto alpha_params = mli::krn::prelu_define_requant_alpha_params(in, slope_coeff, out, scale_v, &identity_params);

mli::krn::compute_prelu_no_broadcast(vec_in, vec_out, in_zp, &identity_params, &alpha_params,
in_prv, out_prv);
Expand Down

0 comments on commit 18ffe4c

Please sign in to comment.