Improves some function performance

For complex ?:-operators, clang generates if-else blocks, e.g. when it cannot determine that neither operand has side effects. By moving some calculations out of the ?:-operator, we allow clang to emit a select instruction instead, which saves instructions and often also execution cycles in either case. Also fixes the implementation of sqrt and native_sqrt.
doe300 · Jul 14, 2019 · ed74948 · ed74948
1 parent d1fb2f8
commit ed74948
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 18 deletions.
diff --git a/include/_conversions.h b/include/_conversions.h
@@ -45,6 +45,7 @@
   /* special case for uint as source type */ \
   (uint)CC(srcType,_MAX) == (uint) UINT_MAX ? \
     /* need to remove high-bit (sign-bit) */ \
+    /* TODO this converts outer ?:-operator to if-else block */ \
     vc4cl_bitcast_int(clamp(vc4cl_bitcast_uint(vc4cl_extend(val)) > (uint##num)0x7FFFFFFF ? (int##num)0x7FFFFFFF : vc4cl_bitcast_int(vc4cl_extend(val)), (int##num)destType##_MIN, (int##num)destType##_MAX)) : \
   vc4cl_bitcast_int(clamp(vc4cl_bitcast_int(vc4cl_extend(val)), (int##num)destType##_MIN, (int##num)destType##_MAX)) \
 
@@ -109,7 +110,7 @@
 */
 #define CONVERT_FLOAT_TO_INTEGER(destType, saturation, rounding) \
         INLINE destType convert_##destType##saturation##rounding(float val) OVERLOADABLE CONST \
-        { \
+        { /* TODO This converts the first ?:-operator to an if-else block, but only for scalar conversion (no vector if-else!) */ \
             int saturatedInt = val >= 2147483648.0f ? 0x7FFFFFFF : val <= -2147483648.0f ? 0x80000000 : vc4cl_ftoi(ROUND_TO_INTEGER(rounding, val)); \
             return vc4cl_bitcast_##destType(CONVERSION_WITH_SATURATION(destType, int, /* scalar */, saturation, saturatedInt)); \
         } \
@@ -198,8 +199,10 @@
 #define CONVERT_UINT_TO_FLOAT(saturation, rounding) \
         INLINE float convert_float##saturation##rounding(uint val) OVERLOADABLE CONST \
         { \
-            /* For the rounding mode: find */ \
-            return vc4cl_msb_set(val) ? vc4cl_itof(vc4cl_bitcast_int(val >> 1)) * 2.0f : vc4cl_itof(vc4cl_bitcast_int(val)); \
+            /* Calculate both variants explicitly to not generate an if-else block for the more complicated ?:-operator */ \
+            float upper = vc4cl_itof(vc4cl_bitcast_int(val >> 1)) * 2.0f; \
+            float lower = vc4cl_itof(vc4cl_bitcast_int(val)); \
+            return vc4cl_msb_set(val) ? upper : lower; \
         } \
         INLINE float##2 convert_float2##saturation##rounding(uint2 val) OVERLOADABLE CONST \
         { \

diff --git a/include/_integer.h b/include/_integer.h
@@ -38,11 +38,26 @@ SIMPLE_1(uint, abs, uint, val, val)
 
 //based on pocl (pocl/lib/kernel/abs_diff.cl)
 SIMPLE_2(uchar, abs_diff, uchar, x, uchar, y, (result_t)abs(x > y ? x - y : y - x))
-SIMPLE_2(uchar, abs_diff, char, x, char, y, (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ (result_t)abs(x - y) : /* different signs */ abs(x) + abs(y))
+COMPLEX_2(uchar, abs_diff, char, x, char, y, {
+	// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
+	result_t noflow = (result_t)abs(x - y);
+	result_t flow = abs(x) + abs(y);
+	return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
+})
 SIMPLE_2(ushort, abs_diff, ushort, x, ushort, y, (result_t)abs(x > y ? x - y : y - x))
-SIMPLE_2(ushort, abs_diff, short, x, short, y, (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ (result_t)abs(x - y) : /* different signs */ abs(x) + abs(y))
+COMPLEX_2(ushort, abs_diff, short, x, short, y, {
+	// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
+	result_t noflow = (result_t)abs(x - y);
+	result_t flow = abs(x) + abs(y);
+	return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
+})
 SIMPLE_2(uint, abs_diff, uint, x, uint, y, abs(x > y ? x - y : y - x))
-SIMPLE_2(uint, abs_diff, int, x, int, y, (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ abs(x - y) : /* different signs */ abs(x) + abs(y))
+COMPLEX_2(uint, abs_diff, int, x, int, y, {
+	// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
+	result_t noflow = abs(x - y);
+	result_t flow = abs(x) + abs(y);
+	return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
+})
 
 SIMPLE_2(uchar, add_sat, uchar, x, uchar, y, vc4cl_v8adds(x, y))
 SIMPLE_2(char, add_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) + vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))

diff --git a/include/_math.h b/include/_math.h
@@ -713,6 +713,7 @@ COMPLEX_1(int, ilogb, float, x, {
 //"Multiply x by 2 to the power k."
 // TODO rewrite, use bit-trickery: x * 2^k = Mx * 2^(Ex + k)
 // TODO this version is wrong for |exponents > 31|
+// TODO have a look at https://gitlab.freedesktop.org/anholt/mesa/blob/f73a16727358c943dec239725a1bf2335f611b6a/src/compiler/nir/nir_opt_algebraic.py#L800
 SIMPLE_2(
 	float, ldexp, float, x, int, k, x *(k >= 0 ? vc4cl_itof((arg1_t)(1) << k) : 1.0f / vc4cl_itof((arg1_t)(1) << -k)))
 SIMPLE_2_SCALAR(float, ldexp, float, x, int, k, x *(k >= 0 ? vc4cl_itof(1 << k) : 1.0f / vc4cl_itof(1 << -k)))
@@ -975,9 +976,19 @@ SIMPLE_1(float, logb, float, x, vc4cl_itof(ilogb(x)))
 SIMPLE_3(float, mad, float, a, float, b, float, c, (a * b) + c)
 
 //"Returns x if |x|>|y|, y if |y|>|x|, otherwise fmax(x, y)"
-SIMPLE_2(float, maxmag, float, x, float, y, fabs(x) > fabs(y) ? x : (fabs(y) > fabs(x) ? y : fmax(x, y)))
+COMPLEX_2(float, maxmag, float, x, float, y, {
+	// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
+	result_t tmp = fmax(x, y);
+	result_t other = fabs(y) > fabs(x) ? y : tmp;
+	return fabs(x) > fabs(y) ? x : other;
+})
 //"Returns x if |x|<|y|, y if |y|<|x|, otherwise fmin(x, y)"
-SIMPLE_2(float, minmag, float, x, float, y, fabs(x) < fabs(y) ? x : (fabs(y) < fabs(x) ? y : fmin(x, y)))
+COMPLEX_2(float, minmag, float, x, float, y, {
+	// explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
+	result_t tmp = fmin(x, y);
+	result_t other = fabs(y) < fabs(x) ? y : tmp;
+	return fabs(x) < fabs(y) ? x : other;
+})
 
 /**
  * Expected behavior:
@@ -999,15 +1010,14 @@ SIMPLE_1(float, nan, uint, nancode, vc4cl_bitcast_float(NAN | nancode))
  * nextafter(0, y < 0) = smallest negative denormal value
  */
 COMPLEX_2(float, nextafter, float, x, float, y, {
-	// TODO correct??
 	int_t ix = vc4cl_bitcast_int(x);
 	int_t iy = vc4cl_bitcast_int(y);
-	int_t res = x == y ? iy :
-						 ix >= 0 ?		/* x > 0 */
-			(ix > iy ? ix - 1 : ix + 1) /* x > y -> x -= ulp otherwise x += ulp */
-			/* x < 0 */
-			:
-			(iy > 0 || ix > iy ? ix - 1 : ix + 1); /* x < y -> x -= ulp otherwise x += ulp */
+	/* x > y -> x -= ulp otherwise x += ulp */
+	int_t xPos = ix > iy ? ix - 1 : ix + 1;
+	/* x < y -> x -= ulp otherwise x += ulp */
+	int_t xNeg = iy > 0 || ix > iy ? ix - 1 : ix + 1;
+	int_t xNotY = ix >= 0 ? /* x > 0 */ xPos : /* x < 0 */ xNeg;
+	int_t res = x == y ? iy : xNotY;
 	return vc4cl_bitcast_float(res);
 })
 
@@ -1035,7 +1045,10 @@ COMPLEX_2(float, nextafter, float, x, float, y, {
  * pow(+-0, -Inf) =+Inf
  */
 // for pow, see also https://stackoverflow.com/questions/4518011/algorithm-for-powfloat-float
-SIMPLE_2(float, pow, float, x, float, y, y < 0.0f ? (result_t) 1.0f / powr(x, y) : powr(x, y));
+COMPLEX_2(float, pow, float, x, float, y, {
+	result_t tmp = powr(x, y);
+	return y < 0.0f ? (result_t) 1.0f / tmp : tmp;
+})
 
 /**
  * Expected behavior:
@@ -1046,6 +1059,7 @@ SIMPLE_2(float, pow, float, x, float, y, y < 0.0f ? (result_t) 1.0f / powr(x, y)
  * pown(+-0, n) = +-0 for odd n and n > 0
  * pown(+-0, n) = 0 for even n and n > 0
  */
+ // TODO is there a way to not need to calculate both versions? Or at least calculate both at once?!
 SIMPLE_2(float, pown, float, x, int, n,
 	n < 0 ? (1.0f / fast_pown(x, vc4cl_bitcast_uint(-n), 32)) : fast_pown(x, vc4cl_bitcast_uint(n), 32))
 
@@ -1182,7 +1196,6 @@ COMPLEX_1(float, rsqrt, float, x, {
 	u.x = u.x * (1.5f - xhalf * u.x * u.x);
 	u.x = u.x * (1.5f - xhalf * u.x * u.x);
 	u.x = u.x * (1.5f - xhalf * u.x * u.x);
-	// TODO see how many iterations we need
 	return u.x;
 })
 
@@ -1337,6 +1350,7 @@ COMPLEX_1(float, sinh, float, val, {
 SIMPLE_1(float, sinpi, float, val, sin(val *M_PI_F))
 
 COMPLEX_1(float, sqrt, float, val, {
+	/*
 	// comparison of 14 algorithms:
 	// https://www.codeproject.com/articles/69941/best-square-root-method-algorithm-function-precisi
 	// https://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Taylor_series
@@ -1359,6 +1373,11 @@ COMPLEX_1(float, sqrt, float, val, {
 	// of 4.7*10-7 from the OpenCL standard
 
 	return val == 0.0f ? (result_t) 0.0f : x;
+	*/
+
+	// The above algorithm is too inaccurate, but the following line is accurate enough and not too expensive to calculate
+	// sqrt(x) = x / sqrt(x) = x * rsqrt(x)
+	return val == 0.0f ? (result_t) 0.0f : (val * rsqrt(val));
 })
 
 /**
@@ -1455,7 +1474,7 @@ SIMPLE_1(float, native_rsqrt, float, val, vc4cl_sfu_rsqrt(val))
 
 SIMPLE_1(float, native_sin, float, val, sin(val))
 
-SIMPLE_1(float, native_sqrt, float, val, native_recip(native_rsqrt(val)))
+SIMPLE_1(float, native_sqrt, float, val, val * native_rsqrt(val))
 
 SIMPLE_1(float, native_tan, float, val, tan(val))