Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AMD: speedup cryptonight_heavy division #2045

Merged
merged 1 commit into from
Nov 11, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
const char *fastIntMathV2CL =
#include "./opencl/fast_int_math_v2.cl"
;
const char *fastDivHeavyCL =
#include "./opencl/fast_div_heavy.cl"
;
const char *cryptonightCL =
#include "./opencl/cryptonight.cl"
;
Expand All @@ -924,6 +927,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)

std::string source_code(cryptonightCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_INT_MATH_V2"), fastIntMathV2CL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_FAST_DIV_HEAVY"), fastDivHeavyCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_AES"), wolfAesCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_WOLF_SKEIN"), wolfSkeinCL);
source_code = std::regex_replace(source_code, std::regex("XMRSTAK_INCLUDE_JH"), jhCL);
Expand Down
6 changes: 4 additions & 2 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ inline int amd_bfe(const uint src0, const uint offset, const uint width)

//#include "opencl/fast_int_math_v2.cl"
XMRSTAK_INCLUDE_FAST_INT_MATH_V2
//#include "fast_div_heavy.cl"
XMRSTAK_INCLUDE_FAST_DIV_HEAVY
//#include "opencl/wolf-aes.cl"
XMRSTAK_INCLUDE_WOLF_AES
//#include "opencl/wolf-skein.cl"
Expand Down Expand Up @@ -802,14 +804,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#if (ALGO == 4 || ALGO == 10)
long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
long q = n / (d | 0x5);
long q = fast_div_heavy(n, d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
idx0 = (d ^ q) & MASK;
// cryptonight_haven
#elif (ALGO == 9)
long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4))));
int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2];
long q = n / (d | 0x5);
long q = fast_div_heavy(n, d | 0x5);
*((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q;
idx0 = ((~d) ^ q) & MASK;
#endif
Expand Down
53 changes: 53 additions & 0 deletions xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
R"===(
#ifndef FAST_DIV_HEAVY_CL
#define FAST_DIV_HEAVY_CL

inline ulong get_reciprocal_heavy(uint a)
{
const uint shift = clz(a);
a <<= shift;

const float a_hi = as_float((a >> 8) + 1 + ((126U + 31U) << 23));
const float a_lo = convert_float_rte(as_int(a & 0xFF) - 256);

const float r = native_recip(a_hi);

const uint tmp0 = as_uint(r);
const uint tmp1 = tmp0 + ((shift + 2 + 64U) << 23);
const float r_scaled = as_float(tmp1);

const float h = fma(a_lo, r, fma(a_hi, r, -1.0f));

const float r_scaled_hi = as_float(tmp1 & ~4095U);
const float h_hi = as_float(as_uint(h) & ~4095U);

const float r_scaled_lo = r_scaled - r_scaled_hi;
const float h_lo = h - h_hi;

const float x1 = h_hi * r_scaled_hi;
const float x2 = h_lo * r_scaled + h_hi * r_scaled_lo;

const long h1 = convert_long_rte(x1);
const int h2 = convert_int_rtp(x2) - convert_int_rtn(h * (x1 + x2));

const ulong result = tmp0 & 0xFFFFFF;
return (result << (shift + 9)) - ((h1 + h2) >> 2);
}

inline long fast_div_heavy(long _a, int _b)
{
const ulong a = abs(_a);
const uint b = abs(_b);
ulong q = mul_hi(a, get_reciprocal_heavy(b));

const long tmp = a - q * b;
const int overshoot = (tmp < 0) ? 1 : 0;
const int undershoot = (tmp >= b) ? 1 : 0;
q += undershoot - overshoot;

return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q;
}

#endif
)==="