Skip to content

Commit

Permalink
Improves some function performance
Browse files Browse the repository at this point in the history
For complex ?:-operators, clang generates if-else blocks,
e.g. if clang cannot determine that neither side has side-effects.
By moving some calculations out of the ?:-operator, we allow clang
to use a select-instruction saving instructions and often also
execution cycles for either case.

Fixes implementation of sqrt and native_sqrt
  • Loading branch information
doe300 committed Jul 14, 2019
1 parent d1fb2f8 commit ed74948
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 18 deletions.
9 changes: 6 additions & 3 deletions include/_conversions.h
Expand Up @@ -45,6 +45,7 @@
/* special case for uint as source type */ \
(uint)CC(srcType,_MAX) == (uint) UINT_MAX ? \
/* need to remove high-bit (sign-bit) */ \
/* TODO this converts outer ?:-operator to if-else block */ \
vc4cl_bitcast_int(clamp(vc4cl_bitcast_uint(vc4cl_extend(val)) > (uint##num)0x7FFFFFFF ? (int##num)0x7FFFFFFF : vc4cl_bitcast_int(vc4cl_extend(val)), (int##num)destType##_MIN, (int##num)destType##_MAX)) : \
vc4cl_bitcast_int(clamp(vc4cl_bitcast_int(vc4cl_extend(val)), (int##num)destType##_MIN, (int##num)destType##_MAX)) \

Expand Down Expand Up @@ -109,7 +110,7 @@
*/
#define CONVERT_FLOAT_TO_INTEGER(destType, saturation, rounding) \
INLINE destType convert_##destType##saturation##rounding(float val) OVERLOADABLE CONST \
{ \
{ /* TODO This converts the first ?:-operator to an if-else block, but only for scalar conversion (no vector if-else!) */ \
int saturatedInt = val >= 2147483648.0f ? 0x7FFFFFFF : val <= -2147483648.0f ? 0x80000000 : vc4cl_ftoi(ROUND_TO_INTEGER(rounding, val)); \
return vc4cl_bitcast_##destType(CONVERSION_WITH_SATURATION(destType, int, /* scalar */, saturation, saturatedInt)); \
} \
Expand Down Expand Up @@ -198,8 +199,10 @@
#define CONVERT_UINT_TO_FLOAT(saturation, rounding) \
INLINE float convert_float##saturation##rounding(uint val) OVERLOADABLE CONST \
{ \
/* For the rounding mode: find */ \
return vc4cl_msb_set(val) ? vc4cl_itof(vc4cl_bitcast_int(val >> 1)) * 2.0f : vc4cl_itof(vc4cl_bitcast_int(val)); \
/* Calculate both variants explicitly to not generate an if-else block for the more complicated ?:-operator */ \
float upper = vc4cl_itof(vc4cl_bitcast_int(val >> 1)) * 2.0f; \
float lower = vc4cl_itof(vc4cl_bitcast_int(val)); \
return vc4cl_msb_set(val) ? upper : lower; \
} \
INLINE float##2 convert_float2##saturation##rounding(uint2 val) OVERLOADABLE CONST \
{ \
Expand Down
21 changes: 18 additions & 3 deletions include/_integer.h
Expand Up @@ -38,11 +38,26 @@ SIMPLE_1(uint, abs, uint, val, val)

//based on pocl (pocl/lib/kernel/abs_diff.cl)
SIMPLE_2(uchar, abs_diff, uchar, x, uchar, y, (result_t)abs(x > y ? x - y : y - x))
SIMPLE_2(uchar, abs_diff, char, x, char, y, (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ (result_t)abs(x - y) : /* different signs */ abs(x) + abs(y))
// abs_diff for signed char operands; the result type is the first macro argument (uchar).
// Both candidate results are computed before the ?:-operator, so the operator only chooses between two
// already-evaluated values and clang can emit a select instead of an if-else block.
COMPLEX_2(uchar, abs_diff, char, x, char, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
// candidate used when the most significant (sign) bits of x and y are equal
result_t noflow = (result_t)abs(x - y);
// candidate used when the sign bits differ: sum of the two magnitudes
result_t flow = abs(x) + abs(y);
// select between the two precomputed candidates based on the comparison of the sign bits
return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ushort, abs_diff, ushort, x, ushort, y, (result_t)abs(x > y ? x - y : y - x))
SIMPLE_2(ushort, abs_diff, short, x, short, y, (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ (result_t)abs(x - y) : /* different signs */ abs(x) + abs(y))
// abs_diff for signed short operands; the result type is the first macro argument (ushort).
// Both candidate results are computed before the ?:-operator, so the operator only chooses between two
// already-evaluated values and clang can emit a select instead of an if-else block.
COMPLEX_2(ushort, abs_diff, short, x, short, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
// candidate used when the most significant (sign) bits of x and y are equal
result_t noflow = (result_t)abs(x - y);
// candidate used when the sign bits differ: sum of the two magnitudes
result_t flow = abs(x) + abs(y);
// select between the two precomputed candidates based on the comparison of the sign bits
return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(uint, abs_diff, uint, x, uint, y, abs(x > y ? x - y : y - x))
SIMPLE_2(uint, abs_diff, int, x, int, y, (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ abs(x - y) : /* different signs */ abs(x) + abs(y))
// abs_diff for signed int operands; the result type is the first macro argument (uint).
// Both candidate results are computed before the ?:-operator, so the operator only chooses between two
// already-evaluated values and clang can emit a select instead of an if-else block.
COMPLEX_2(uint, abs_diff, int, x, int, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
// candidate used when the most significant (sign) bits of x and y are equal
// (no cast to result_t here, in contrast to the char and short variants)
result_t noflow = abs(x - y);
// candidate used when the sign bits differ: sum of the two magnitudes
result_t flow = abs(x) + abs(y);
// select between the two precomputed candidates based on the comparison of the sign bits
return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})

SIMPLE_2(uchar, add_sat, uchar, x, uchar, y, vc4cl_v8adds(x, y))
SIMPLE_2(char, add_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) + vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
Expand Down
43 changes: 31 additions & 12 deletions include/_math.h
Expand Up @@ -713,6 +713,7 @@ COMPLEX_1(int, ilogb, float, x, {
//"Multiply x by 2 to the power k."
// TODO rewrite, use bit-trickery: x * 2^k = Mx * 2^(Ex + k)
// TODO this version is wrong for |exponents > 31|
// TODO have a look at https://gitlab.freedesktop.org/anholt/mesa/blob/f73a16727358c943dec239725a1bf2335f611b6a/src/compiler/nir/nir_opt_algebraic.py#L800
SIMPLE_2(
float, ldexp, float, x, int, k, x *(k >= 0 ? vc4cl_itof((arg1_t)(1) << k) : 1.0f / vc4cl_itof((arg1_t)(1) << -k)))
SIMPLE_2_SCALAR(float, ldexp, float, x, int, k, x *(k >= 0 ? vc4cl_itof(1 << k) : 1.0f / vc4cl_itof(1 << -k)))
Expand Down Expand Up @@ -975,9 +976,19 @@ SIMPLE_1(float, logb, float, x, vc4cl_itof(ilogb(x)))
SIMPLE_3(float, mad, float, a, float, b, float, c, (a * b) + c)

//"Returns x if |x|>|y|, y if |y|>|x|, otherwise fmax(x, y)"
SIMPLE_2(float, maxmag, float, x, float, y, fabs(x) > fabs(y) ? x : (fabs(y) > fabs(x) ? y : fmax(x, y)))
// maxmag: picks the operand with the larger magnitude and falls back to fmax(x, y) when neither
// magnitude comparison holds.
// The nested ?:-operator is split into named intermediate values so that every ?: only selects between
// already-computed operands.
COMPLEX_2(float, maxmag, float, x, float, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
// fallback result, used when neither |x| > |y| nor |y| > |x| holds
result_t tmp = fmax(x, y);
// result for the case that |x| > |y| does not hold: y if its magnitude is larger, otherwise the fallback
result_t other = fabs(y) > fabs(x) ? y : tmp;
// outer selection: x wins if its magnitude is strictly larger
return fabs(x) > fabs(y) ? x : other;
})
//"Returns x if |x|<|y|, y if |y|<|x|, otherwise fmin(x, y)"
SIMPLE_2(float, minmag, float, x, float, y, fabs(x) < fabs(y) ? x : (fabs(y) < fabs(x) ? y : fmin(x, y)))
// minmag: picks the operand with the smaller magnitude and falls back to fmin(x, y) when neither
// magnitude comparison holds.
// The nested ?:-operator is split into named intermediate values so that every ?: only selects between
// already-computed operands.
COMPLEX_2(float, minmag, float, x, float, y, {
// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
// fallback result, used when neither |x| < |y| nor |y| < |x| holds
result_t tmp = fmin(x, y);
// result for the case that |x| < |y| does not hold: y if its magnitude is smaller, otherwise the fallback
result_t other = fabs(y) < fabs(x) ? y : tmp;
// outer selection: x wins if its magnitude is strictly smaller
return fabs(x) < fabs(y) ? x : other;
})

/**
* Expected behavior:
Expand All @@ -999,15 +1010,14 @@ SIMPLE_1(float, nan, uint, nancode, vc4cl_bitcast_float(NAN | nancode))
* nextafter(0, y < 0) = smallest negative denormal value
*/
// nextafter: steps the bit pattern of x by one (ix - 1 or ix + 1) in the direction of y and returns the
// bit pattern of y when x == y.
// NOTE(review): zero inputs look unhandled. For x = +0 (ix == 0) and y < 0 (iy negative), "ix > iy" holds
// and ix - 1 produces the bit pattern 0xFFFFFFFF (a NaN) instead of the smallest negative denormal listed
// in the expected behavior above. Likewise x = -0 (ix == 0x80000000) with y > 0 takes ix - 1, which gives
// 0x7FFFFFFF assuming two's-complement wrap-around. NaN inputs are not special-cased either - confirm.
COMPLEX_2(float, nextafter, float, x, float, y, {
// TODO correct??
// reinterpret both operands as integers to be able to step by one ulp via integer +/- 1
int_t ix = vc4cl_bitcast_int(x);
int_t iy = vc4cl_bitcast_int(y);
int_t res = x == y ? iy :
ix >= 0 ? /* x > 0 */
(ix > iy ? ix - 1 : ix + 1) /* x > y -> x -= ulp otherwise x += ulp */
/* x < 0 */
:
(iy > 0 || ix > iy ? ix - 1 : ix + 1); /* x < y -> x -= ulp otherwise x += ulp */
/* x > y -> x -= ulp otherwise x += ulp */
int_t xPos = ix > iy ? ix - 1 : ix + 1;
/* x < y -> x -= ulp otherwise x += ulp */
int_t xNeg = iy > 0 || ix > iy ? ix - 1 : ix + 1;
int_t xNotY = ix >= 0 ? /* x > 0 */ xPos : /* x < 0 */ xNeg;
int_t res = x == y ? iy : xNotY;
// convert the stepped bit pattern back to a floating-point value
return vc4cl_bitcast_float(res);
})

Expand Down Expand Up @@ -1035,7 +1045,10 @@ COMPLEX_2(float, nextafter, float, x, float, y, {
* pow(+-0, -Inf) =+Inf
*/
// for pow, see also https://stackoverflow.com/questions/4518011/algorithm-for-powfloat-float
SIMPLE_2(float, pow, float, x, float, y, y < 0.0f ? (result_t) 1.0f / powr(x, y) : powr(x, y));
// pow(x, y) implemented on top of powr(x, y).
// powr is evaluated once, outside of the ?:-operator, so the operator only chooses between that value and
// its reciprocal.
// NOTE(review): for y < 0 this returns 1 / powr(x, y). Whether that equals x^y depends on how powr treats
// negative exponents, which is not visible here - confirm against the powr implementation.
COMPLEX_2(float, pow, float, x, float, y, {
result_t tmp = powr(x, y);
return y < 0.0f ? (result_t) 1.0f / tmp : tmp;
})

/**
* Expected behavior:
Expand All @@ -1046,6 +1059,7 @@ SIMPLE_2(float, pow, float, x, float, y, y < 0.0f ? (result_t) 1.0f / powr(x, y)
* pown(+-0, n) = +-0 for odd n and n > 0
* pown(+-0, n) = 0 for even n and n > 0
*/
// TODO is there a way to not need to calculate both versions? Or at least calculate both at once?!
SIMPLE_2(float, pown, float, x, int, n,
n < 0 ? (1.0f / fast_pown(x, vc4cl_bitcast_uint(-n), 32)) : fast_pown(x, vc4cl_bitcast_uint(n), 32))

Expand Down Expand Up @@ -1182,7 +1196,6 @@ COMPLEX_1(float, rsqrt, float, x, {
u.x = u.x * (1.5f - xhalf * u.x * u.x);
u.x = u.x * (1.5f - xhalf * u.x * u.x);
u.x = u.x * (1.5f - xhalf * u.x * u.x);
// TODO see how many iterations we need
return u.x;
})

Expand Down Expand Up @@ -1337,6 +1350,7 @@ COMPLEX_1(float, sinh, float, val, {
SIMPLE_1(float, sinpi, float, val, sin(val *M_PI_F))

COMPLEX_1(float, sqrt, float, val, {
/*
// comparison of 14 algorithms:
// https://www.codeproject.com/articles/69941/best-square-root-method-algorithm-function-precisi
// https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Taylor_series
Expand All @@ -1359,6 +1373,11 @@ COMPLEX_1(float, sqrt, float, val, {
// of 4.7*10^-7 from the OpenCL standard
return val == 0.0f ? (result_t) 0.0f : x;
*/

// The above algorithm is too inaccurate, but the following line is accurate enough and not too expensive to calculate
// sqrt(x) = x / sqrt(x) = x * rsqrt(x)
return val == 0.0f ? (result_t) 0.0f : (val * rsqrt(val));
})

/**
Expand Down Expand Up @@ -1455,7 +1474,7 @@ SIMPLE_1(float, native_rsqrt, float, val, vc4cl_sfu_rsqrt(val))

SIMPLE_1(float, native_sin, float, val, sin(val))

SIMPLE_1(float, native_sqrt, float, val, native_recip(native_rsqrt(val)))
// native_sqrt computed as val * native_rsqrt(val), i.e. sqrt(x) = x * (1 / sqrt(x)).
// NOTE(review): unlike sqrt, there is no special case for val == 0 here. If native_rsqrt(0) evaluates to
// +Inf, the product 0 * Inf would be NaN - confirm what the SFU returns for an input of 0.
SIMPLE_1(float, native_sqrt, float, val, val * native_rsqrt(val))

SIMPLE_1(float, native_tan, float, val, tan(val))

Expand Down

0 comments on commit ed74948

Please sign in to comment.