fireice-uk · fireice-uk · Nov 11, 2018 · Nov 6, 2018
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -903,6 +903,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	const char *fastIntMathV2CL =
 			#include "./opencl/fast_int_math_v2.cl"
 	;
+    const char *fastDivHeavyCL =
+        #include "./opencl/fast_div_heavy.cl"
+    ;
 	const char *cryptonightCL =
 			#include "./opencl/cryptonight.cl"
 	;
@@ -924,6 +927,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	std::string source_code(cryptonightCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
+    source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
 	source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -80,6 +80,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)
 
 //#include "opencl/fast_int_math_v2.cl"
 XMRSTAK_INCLUDE_FAST_INT_MATH_V2
+//#include "fast_div_heavy.cl"
+XMRSTAK_INCLUDE_FAST_DIV_HEAVY
 //#include "opencl/wolf-aes.cl"
 XMRSTAK_INCLUDE_WOLF_AES
 //#include "opencl/wolf-skein.cl"
@@ -802,14 +804,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #if (ALGO == 4 || ALGO == 10)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
-			long q = n / (d | 0x5);
+			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
 			idx0 = (d ^ q) & MASK;
 // cryptonight_haven
 #elif (ALGO == 9)
 			long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
 			int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
-			long q = n / (d | 0x5);
+			long q = fast_div_heavy(n, d | 0x5);
 			*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
 			idx0 = ((~d) ^ q) & MASK;
 #endif

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
@@ -0,0 +1,53 @@
+R"===(
+#ifndef FAST_DIV_HEAVY_CL
+#define FAST_DIV_HEAVY_CL
+
+inline ulong get_reciprocal_heavy(uint a)
+{
+	const uint shift = clz(a);
+	a <<= shift;
+
+	const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23));
+	const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256);
+
+	const float r = native_recip(a_hi);
+
+	const uint tmp0 = as_uint(r);
+	const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23);
+	const float r_scaled = as_float(tmp1);
+
+	const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));
+
+	const float r_scaled_hi = as_float(tmp1 & ~4095U);
+	const float h_hi = as_float(as_uint(h) & ~4095U);
+
+	const float r_scaled_lo = r_scaled - r_scaled_hi;
+	const float h_lo = h - h_hi;
+
+	const float x1 = h_hi * r_scaled_hi;
+	const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo;
+
+	const long h1 = convert_long_rte(x1);
+	const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2));
+
+	const ulong result = tmp0 & 0xFFFFFF;
+	return (result << (shift + 9)) - ((h1 + h2) >> 2);
+}
+
+inline long fast_div_heavy(long _a, int _b)
+{
+	const ulong a = abs(_a);
+	const uint b = abs(_b);
+	ulong q = mul_hi(a, get_reciprocal_heavy(b));
+
+	const long tmp = a - q * b;
+	const int overshoot = (tmp < 0) ? 1 : 0;
+	const int undershoot = (tmp >= b) ? 1 : 0;
+	q += undershoot - overshoot;
+
+	return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
+}
+
+#endif
+)==="
+