From 4457b607000635c2059d85f85530e65d4f333383 Mon Sep 17 00:00:00 2001 From: EDDragonWolf Date: Fri, 22 Feb 2019 10:49:43 -0800 Subject: [PATCH 1/4] Added support of CryptoNight v8 ReverseWaltz (aka cryptonight_v8_reversewaltz, reverse_waltz or graft) --- README.md | 1 + xmrstak/backend/amd/amd_gpu/gpu.cpp | 2 +- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 37 ++++++++++---- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 2 +- .../backend/cpu/crypto/cryptonight_aesni.h | 30 ++++++++++-- xmrstak/backend/cpu/minethd.cpp | 21 +++++++- xmrstak/backend/cryptonight.hpp | 13 +++-- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 48 ++++++++++++++++--- .../backend/nvidia/nvcc_code/cuda_extra.cu | 13 +++-- xmrstak/jconf.cpp | 6 ++- xmrstak/pools.tpl | 1 + 11 files changed, 141 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index ff87dcead..c6bbfdf00 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ If your prefered coin is not listed, you can choose one of the following algorit - cryptonight_v8 - cryptonight_v8_half (used by masari and stellite) - cryptonight_v8_zelerius + - cryptonight_v8_reversewaltz (used by graft) - 4MiB scratchpad memory - cryptonight_haven - cryptonight_heavy diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 8713784c2..a47c1015f 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -430,7 +430,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ * this is required if the dev pool is mining monero * but the user tuned there settings for another currency */ - if(miner_algo == cryptonight_monero_v8) + if(miner_algo == cryptonight_monero_v8 || miner_algo == cryptonight_v8_reversewaltz) { if(ctx->memChunk < 2) mem_chunk_exp = 1u << 2; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 2ca09c31c..395101b2b 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -30,6 +30,7 @@ R"===( #define cryptonight_superfast 12 #define cryptonight_gpu 13 #define cryptonight_conceal 14 +#define cryptonight_v8_reversewaltz 15 /* For Mesa clover support */ #ifdef cl_clang_storage_class_specifiers @@ -639,7 +640,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, R"===( // __NV_CL_C_VERSION checks if NVIDIA opencl is used -#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) +#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) # define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4))))) #else @@ -659,7 +660,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states float4 conc_var = (float4)(0.0f); #endif -#if(ALGO == cryptonight_monero_v8) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) ulong b[4]; uint4 b_x[2]; // NVIDIA @@ -673,7 +674,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif __local uint AES0[256], AES1[256]; -#if(ALGO == cryptonight_monero_v8) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) # if defined(__clang__) && !defined(__NV_CL_C_VERSION) __local uint RCP[256]; # endif @@ -689,7 +690,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states AES0[i] = tmp; AES1[i] = rotate(tmp, 8U); -#if(ALGO == cryptonight_monero_v8 && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) +#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif } @@ -723,7 +724,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[0] = ((uint4 *)b)[0]; -#if(ALGO == cryptonight_monero_v8) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) a[1] = states[1] ^ states[5]; b[2] = states[8] ^ states[10]; b[3] = states[9] ^ states[11]; @@ -755,7 +756,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states { ulong c[2]; -#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION)) +#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION)) uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif @@ -794,6 +795,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states } #endif +#if(ALGO == cryptonight_v8_reversewaltz) + { + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); + } +#endif + #if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2) uint table = 0x75310U; b_x[0] ^= ((uint4 *)c)[0]; @@ -807,7 +819,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states SCRATCHPAD_CHUNK(0) = b_x[0]; idx0 = as_uint2(c[0]).s0 & MASK; -#elif(ALGO == cryptonight_monero_v8) +#elif(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data @@ -826,7 +838,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); -#if(ALGO == cryptonight_monero_v8) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) // Use division and square root results from the _previous_ iteration to hide the latency tmp.s0 ^= division_result.s0; tmp.s1 ^= division_result.s1 ^ sqrt_result; @@ -853,6 +865,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); result_mul ^= chunk2; ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); +#if(ALGO == cryptonight_v8_reversewaltz) + { + ulong2 chunk_tmp = chunk3; + chunk3 = chunk1; + chunk1 = chunk_tmp; + } +#endif SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); @@ -882,7 +901,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ((uint4 *)a)[0] ^= tmp; -#if (ALGO == cryptonight_monero_v8) +#if (ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) # if defined(__NV_CL_C_VERSION) // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index 8878db618..2b34b761c 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -3,7 +3,7 @@ R"===( * @author SChernykh */ -#if(ALGO == cryptonight_monero_v8) +#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) static const __constant uint RCP_C[256] = { diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index dc378e88a..3cfe08b47 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -626,6 +626,16 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } \ + if(ALGO == cryptonight_v8_reversewaltz) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } #define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \ @@ -641,10 +651,22 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ + } \ + if(ALGO == cryptonight_v8_reversewaltz) \ + { \ + const uint64_t idx1 = idx0 & MASK; \ + const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \ + hi ^= ((uint64_t*)&chunk2)[0]; \ + lo ^= ((uint64_t*)&chunk2)[1]; \ + const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \ } #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ { \ uint64_t sqrt_result_tmp; \ assign(sqrt_result_tmp, sqrt_result); \ @@ -705,7 +727,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) idx0 = h0[0] ^ h0[4]; \ ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \ bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ { \ bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \ division_result_xmm = _mm_cvtsi64_si128(h0[12]); \ @@ -744,7 +766,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) ptr0 = (__m128i *)&l0[idx0 & MASK]; \ if(PREFETCH) \ _mm_prefetch((const char*)ptr0, _MM_HINT_T0); \ - if(ALGO != cryptonight_monero_v8) \ + if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_v8_reversewaltz) \ bx0 = cx #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \ @@ -761,7 +783,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) ah0 += lo; \ al0 += hi; \ } \ - if(ALGO == cryptonight_monero_v8) \ + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \ { \ bx1 = bx0; \ bx0 = cx; \ diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 50507f2ae..b973f5ccf 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -427,6 +427,16 @@ bool minethd::self_test() hashf("This is a test This is a test This is a test", 44, out, ctx, algo); bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0; } + else if(algo == POW(cryptonight_v8_reversewaltz)) + { + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); + bResult = memcmp(out, "\x32\xf7\x36\xec\x1d\x2f\x3f\xc5\x4c\x49\xbe\xb8\xa0\x47\x6c\xbf\xdd\x14\xc3\x51\xb9\xc6\xd7\x2c\x6f\x9f\xfc\xb5\x87\x5b\xe6\xb3", 32) == 0; + + hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo); + hashf("This is a test This is a test This is a test", 44, out, ctx, algo); + bResult &= memcmp(out, "\x32\xf7\x36\xec\x1d\x2f\x3f\xc5\x4c\x49\xbe\xb8\xa0\x47\x6c\xbf\xdd\x14\xc3\x51\xb9\xc6\xd7\x2c\x6f\x9f\xfc\xb5\x87\x5b\xe6\xb3", 32) == 0; + } else printer::inst()->print_msg(L0, "Cryptonight hash self-test NOT defined for POW %s", algo.Name().c_str()); @@ -564,6 +574,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc case cryptonight_conceal: algv = 13; break; + case cryptonight_v8_reversewaltz: + algv = 14; + break; default: algv = 2; break; @@ -638,7 +651,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc Cryptonight_hash::template hash, Cryptonight_hash::template hash, Cryptonight_hash::template hash, - Cryptonight_hash::template hash + Cryptonight_hash::template hash, + + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash, + Cryptonight_hash::template hash }; std::bitset<2> digit; @@ -647,7 +665,6 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc auto selected_function = func_table[ algv << 2 | digit.to_ulong() ]; - // check for asm optimized version for cryptonight_v8 if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes && algo.Mem() == CN_MEMORY && algo.Iter() == CN_ITER) { diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index db0787789..09b014548 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -24,6 +24,7 @@ enum xmrstak_algo_id cryptonight_superfast = 12, cryptonight_gpu = 13, cryptonight_conceal = 14, + cryptonight_v8_reversewaltz = 15, //equal to cryptonight_monero_v8 but with 3/4 iterations and reversed shuffle operation cryptonight_turtle = start_derived_algo_id, cryptonight_v8_half = (start_derived_algo_id + 1), @@ -37,7 +38,7 @@ enum xmrstak_algo_id */ inline std::string get_algo_name(xmrstak_algo_id algo_id) { - static std::array base_algo_names = + static std::array base_algo_names = {{ "invalid_algo", "cryptonight", @@ -53,7 +54,8 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id) "cryptonight_v8", "cryptonight_superfast", "cryptonight_gpu", - "cryptonight_conceal" + "cryptonight_conceal", + "cryptonight_v8_reversewaltz" // used by graft }}; static std::array derived_algo_names = @@ -172,9 +174,11 @@ constexpr uint32_t CN_TURTLE_MASK = 0x1FFF0; constexpr uint32_t CN_ZELERIUS_ITER = 0x60000; +constexpr uint32_t CN_WALTZ_ITER = 0x60000; + inline xmrstak_algo POW(xmrstak_algo_id algo_id) { - static std::array pow = {{ + static std::array pow = {{ {invalid_algo, invalid_algo}, {cryptonight, cryptonight, CN_ITER, CN_MEMORY}, {cryptonight_lite, cryptonight_lite, CN_ITER/2, CN_MEMORY/2}, @@ -189,7 +193,8 @@ inline xmrstak_algo POW(xmrstak_algo_id algo_id) {cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY}, {cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY}, {cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK}, - {cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY} + {cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY}, + {cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY} }}; static std::array derived_pow = diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index 184825222..ab4868fc3 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -314,7 +314,7 @@ __global__ void cryptonight_core_gpu_phase2_double( uint64_t bx1; uint32_t sqrt_result; uint64_t division_result; - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub]; bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub]; @@ -370,6 +370,22 @@ __global__ void cryptonight_core_gpu_phase2_double( myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; } + if(ALGO == cryptonight_v8_reversewaltz) + { + + const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ]; + const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ]; +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); +#endif + myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; + myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; + myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + } + myChunks[ idx1 + sub ] = cx_aes ^ bx0; if(MEM_MODE == 0) { @@ -397,14 +413,14 @@ __global__ void cryptonight_core_gpu_phase2_double( else ((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub]; - if(ALGO != cryptonight_monero_v8) + if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_v8_reversewaltz) bx0 = cx_aes; uint64_t cx_mul; ((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0); ((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0); - if((ALGO == cryptonight_monero_v8) && sub == 1) + if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && sub == 1) { // Use division and square root results from the _previous_ iteration to hide the latency ((uint32_t*)&division_result)[1] ^= sqrt_result; @@ -438,6 +454,21 @@ __global__ void cryptonight_core_gpu_phase2_double( __syncwarp(); #else __syncthreads( ); +#endif + myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; + myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; + myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; + } + if(ALGO == cryptonight_v8_reversewaltz) + { + const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ] ^ res; + uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ]; + res ^= ((uint64_t*)&chunk2)[0]; + const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ]; +#if (__CUDACC_VER_MAJOR__ >= 9) + __syncwarp(); +#else + __syncthreads( ); #endif myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1; myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; @@ -445,7 +476,7 @@ __global__ void cryptonight_core_gpu_phase2_double( } ax0 += res; } - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { bx1 = bx0; bx0 = cx_aes; @@ -468,7 +499,7 @@ __global__ void cryptonight_core_gpu_phase2_double( if ( bfactor > 0 ) { ((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0; - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { ((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0; ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1; @@ -814,7 +845,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo for ( int i = 0; i < partcount; i++ ) { - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { // two threads per block CUDA_CHECK_MSG_KERNEL( @@ -1019,7 +1050,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui cryptonight_core_gpu_hash_gpu, cryptonight_core_gpu_hash, - cryptonight_core_gpu_hash + cryptonight_core_gpu_hash, + + cryptonight_core_gpu_hash, + cryptonight_core_gpu_hash, }; std::bitset<1> digit; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index 43e21fb42..44eb19264 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -127,7 +127,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a ); XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b ); memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 ); - if(ALGO == cryptonight_monero_v8) + if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) { memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 ); // bx1 @@ -314,7 +314,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) { ctx_b_size += sizeof(uint32_t) * 4 * wsize; } - else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) + else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) + || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end())) { // bx1 (16byte), division_result (8byte) and sqrt_result (8byte) ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize; @@ -376,6 +377,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); } + else if(miner_algo == cryptonight_v8_reversewaltz) + { + CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<>>( wsize, ctx->d_input, ctx->inputlen, startNonce, + ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 )); + } else { /* pass two times d_ctx_state because the second state is used later in phase1, @@ -743,7 +749,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) } // check if cryptonight_monero_v8 is selected for the user pool - bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); + bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()) + || (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end()); // overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0 if(useCryptonight_v8 && gpuArch >= 50) diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 5dbddb09b..3c85955b3 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -107,8 +107,9 @@ xmrstak::coin_selection coins[] = { { "cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_monero_v8)}, nullptr }, { "cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, { "cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_monero_v8)}, nullptr }, + { "cryptonight_v8_reversewaltz",{POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" }, { "freehaven", {POW(cryptonight_superfast)}, {POW(cryptonight_monero_v8)}, nullptr }, - { "graft", {POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr }, + { "graft", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" }, { "haven", {POW(cryptonight_haven)}, {POW(cryptonight_heavy)}, nullptr }, { "lethean", {POW(cryptonight_monero)}, {POW(cryptonight_monero_v8)}, nullptr }, { "masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_monero_v8)}, nullptr }, @@ -118,7 +119,8 @@ xmrstak::coin_selection coins[] = { { "stellite", {POW(cryptonight_v8_half)}, {POW(cryptonight_monero_v8)}, nullptr }, { "turtlecoin", {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, { "plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, - { "zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr } + { "zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr }, + { "reverse_waltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" } }; constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0])); diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index f8f1d7d6c..8de92347b 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -51,6 +51,7 @@ POOLCONF], * cryptonight_v8 * cryptonight_v8_half (used by masari and stellite) * cryptonight_v8_zelerius + * cryptonight_v8_reversewaltz (used by graft) * # 4MiB scratchpad memory * cryptonight_bittube2 * cryptonight_haven From 4bfc9d39e96f3bfd3b720b35349d8dc141a1cfc6 Mon Sep 17 00:00:00 2001 From: EDDragonWolf Date: Wed, 27 Feb 2019 09:52:19 -0800 Subject: [PATCH 2/4] Fixed format of CryptoNight ReverseWaltz configuration --- README.md | 2 +- xmrstak/jconf.cpp | 7 +++---- xmrstak/pools.tpl | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index c6bbfdf00..a7eee6041 100644 --- a/README.md +++ b/README.md @@ -71,8 +71,8 @@ If your prefered coin is not listed, you can choose one of the following algorit - cryptonight_v7_stellite - cryptonight_v8 - cryptonight_v8_half (used by masari and stellite) - - cryptonight_v8_zelerius - cryptonight_v8_reversewaltz (used by graft) + - cryptonight_v8_zelerius - 4MiB scratchpad memory - cryptonight_haven - cryptonight_heavy diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp index 3c85955b3..04c8735ba 100644 --- a/xmrstak/jconf.cpp +++ b/xmrstak/jconf.cpp @@ -103,13 +103,13 @@ xmrstak::coin_selection coins[] = { { "cryptonight_v7", {POW(cryptonight_monero)}, {POW(cryptonight_monero_v8)}, nullptr }, { "cryptonight_v8", {POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr }, { "cryptonight_v8_half", {POW(cryptonight_v8_half)}, {POW(cryptonight_monero_v8)}, nullptr }, + { "cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, nullptr }, { "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_monero_v8)}, nullptr }, { "cryptonight_v7_stellite", {POW(cryptonight_stellite)}, {POW(cryptonight_monero_v8)}, nullptr }, { "cryptonight_gpu", {POW(cryptonight_gpu)}, {POW(cryptonight_gpu)}, "pool.ryo-currency.com:3333" }, { "cryptonight_conceal", {POW(cryptonight_conceal)}, {POW(cryptonight_monero_v8)}, nullptr }, - { "cryptonight_v8_reversewaltz",{POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" }, { "freehaven", {POW(cryptonight_superfast)}, {POW(cryptonight_monero_v8)}, nullptr }, - { "graft", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" }, + { "graft", {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr }, { "haven", {POW(cryptonight_haven)}, {POW(cryptonight_heavy)}, nullptr }, { "lethean", {POW(cryptonight_monero)}, {POW(cryptonight_monero_v8)}, nullptr }, { "masari", {POW(cryptonight_v8_half)}, {POW(cryptonight_monero_v8)}, nullptr }, @@ -119,8 +119,7 @@ xmrstak::coin_selection coins[] = { { "stellite", {POW(cryptonight_v8_half)}, {POW(cryptonight_monero_v8)}, nullptr }, { "turtlecoin", {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr }, { "plenteum", {POW(cryptonight_turtle)}, {POW(cryptonight_turtle)}, nullptr }, - { "zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr }, - { "reverse_waltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" } + { "zelerius", {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr } }; constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0])); diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index 8de92347b..891fb8d78 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -50,8 +50,8 @@ POOLCONF], * cryptonight_v7 * cryptonight_v8 * cryptonight_v8_half (used by masari and stellite) - * cryptonight_v8_zelerius * cryptonight_v8_reversewaltz (used by graft) + * cryptonight_v8_zelerius * # 4MiB scratchpad memory * cryptonight_bittube2 * cryptonight_haven From c9db12855e25d82436fb0af49dc57e6efb623069 Mon Sep 17 00:00:00 2001 From: EDDragonWolf Date: Thu, 28 Feb 2019 10:17:28 -0800 Subject: [PATCH 3/4] Fixed if-else condition for OpenCL and CUDA implementation --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 4 +--- xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 395101b2b..5229e15b4 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -793,9 +793,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); } -#endif - -#if(ALGO == cryptonight_v8_reversewaltz) +#elif(ALGO == cryptonight_v8_reversewaltz) { ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1)); ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu index ab4868fc3..a8447bb69 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu @@ -369,8 +369,7 @@ __global__ void cryptonight_core_gpu_phase2_double( myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0; myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0; } - - if(ALGO == cryptonight_v8_reversewaltz) + else if(ALGO == cryptonight_v8_reversewaltz) { const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ]; From d10dafcdae67389d3b5d9b2fc341cb122de8dc13 Mon Sep 17 00:00:00 2001 From: EDDragonWolf Date: Wed, 6 Mar 2019 00:56:34 -0800 Subject: [PATCH 4/4] Fixed AMD support of CryptoNight v8 ReverseWaltz --- xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 5229e15b4..b78f2bcf7 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -30,7 +30,7 @@ R"===( #define cryptonight_superfast 12 #define cryptonight_gpu 13 #define cryptonight_conceal 14 -#define cryptonight_v8_reversewaltz 15 +#define cryptonight_v8_reversewaltz 17 /* For Mesa clover support */ #ifdef cl_clang_storage_class_specifiers @@ -864,14 +864,12 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states result_mul ^= chunk2; ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); #if(ALGO == cryptonight_v8_reversewaltz) - { - ulong2 chunk_tmp = chunk3; - chunk3 = chunk1; - chunk1 = chunk_tmp; - } -#endif + SCRATCHPAD_CHUNK(1) = as_uint4(chunk1 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk3 + ((ulong2 *)b_x)[0]); +#else SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); +#endif SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); a[0] += result_mul.s0; a[1] += result_mul.s1;