From 4457b607000635c2059d85f85530e65d4f333383 Mon Sep 17 00:00:00 2001
From: EDDragonWolf <eddragonwolf@ukr.net>
Date: Fri, 22 Feb 2019 10:49:43 -0800
Subject: [PATCH 1/4] Added support of CryptoNight v8 ReverseWaltz (aka
 cryptonight_v8_reversewaltz, reverse_waltz or graft)

---
 README.md                                     |  1 +
 xmrstak/backend/amd/amd_gpu/gpu.cpp           |  2 +-
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 37 ++++++++++----
 .../amd/amd_gpu/opencl/fast_int_math_v2.cl    |  2 +-
 .../backend/cpu/crypto/cryptonight_aesni.h    | 30 ++++++++++--
 xmrstak/backend/cpu/minethd.cpp               | 21 +++++++-
 xmrstak/backend/cryptonight.hpp               | 13 +++--
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 48 ++++++++++++++++---
 .../backend/nvidia/nvcc_code/cuda_extra.cu    | 13 +++--
 xmrstak/jconf.cpp                             |  6 ++-
 xmrstak/pools.tpl                             |  1 +
 11 files changed, 141 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index ff87dcead..c6bbfdf00 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
     - cryptonight_v8
     - cryptonight_v8_half (used by masari and stellite)
     - cryptonight_v8_zelerius
+    - cryptonight_v8_reversewaltz (used by graft)
 - 4MiB scratchpad memory
     - cryptonight_haven
     - cryptonight_heavy
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 8713784c2..a47c1015f 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -430,7 +430,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 * this is required if the dev pool is mining monero
 		 * but the user tuned there settings for another currency
 		 */
-		if(miner_algo == cryptonight_monero_v8)
+		if(miner_algo == cryptonight_monero_v8 || miner_algo == cryptonight_v8_reversewaltz)
 		{
 			if(ctx->memChunk < 2)
 				mem_chunk_exp = 1u << 2;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 2ca09c31c..395101b2b 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -30,6 +30,7 @@ R"===(
 #define cryptonight_superfast 12
 #define cryptonight_gpu 13
 #define cryptonight_conceal 14
+#define cryptonight_v8_reversewaltz 15
 
 /* For Mesa clover support */
 #ifdef cl_clang_storage_class_specifiers
@@ -639,7 +640,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 R"===(
 
 // __NV_CL_C_VERSION checks if NVIDIA opencl is used
-#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
 #	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
 #else
@@ -659,7 +660,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	float4 conc_var = (float4)(0.0f);
 #endif
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 	ulong b[4];
 	uint4 b_x[2];
 // NVIDIA
@@ -673,7 +674,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 	__local uint AES0[256], AES1[256];
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 #	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
 	__local uint RCP[256];
 #	endif
@@ -689,7 +690,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		AES0[i] = tmp;
 		AES1[i] = rotate(tmp, 8U);
 
-#if(ALGO == cryptonight_monero_v8 && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
 		RCP[i] = RCP_C[i];
 #endif
 	}
@@ -723,7 +724,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 		b_x[0] = ((uint4 *)b)[0];
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 		a[1] = states[1] ^ states[5];
 		b[2] = states[8] ^ states[10];
 		b[3] = states[9] ^ states[11];
@@ -755,7 +756,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	{
 			ulong c[2];
 
-#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
 			uint idxS = idx0 & 0x30U;
  			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #endif
@@ -794,6 +795,17 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		}
 #endif
 
+#if(ALGO == cryptonight_v8_reversewaltz)
+		{
+			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1));
+			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+			ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(3));
+			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+		}
+#endif
+
 #if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
 			uint table = 0x75310U;
 			b_x[0] ^= ((uint4 *)c)[0];
@@ -807,7 +819,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(0) = b_x[0];
 			idx0 = as_uint2(c[0]).s0 & MASK;
 
-#elif(ALGO == cryptonight_monero_v8)
+#elif(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
 			// flush shuffled data
@@ -826,7 +838,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			tmp.s0 ^= division_result.s0;
 			tmp.s1 ^= division_result.s1 ^ sqrt_result;
@@ -853,6 +865,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
 			result_mul ^= chunk2;
 			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+#if(ALGO == cryptonight_v8_reversewaltz)
+			{
+				ulong2 chunk_tmp = chunk3;
+				chunk3 = chunk1;
+				chunk1 = chunk_tmp;
+			}
+#endif
 			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
 			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
 			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
@@ -882,7 +901,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 		((uint4 *)a)[0] ^= tmp;
 
-#if (ALGO == cryptonight_monero_v8)
+#if (ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 #	if defined(__NV_CL_C_VERSION)
 			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
index 8878db618..2b34b761c 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -3,7 +3,7 @@ R"===(
  * @author SChernykh
  */
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 
 static const __constant uint RCP_C[256] =
 {
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index dc378e88a..3cfe08b47 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -626,6 +626,16 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	} \
+	if(ALGO == cryptonight_v8_reversewaltz) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk3 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
 	}
 
 #define CN_MONERO_V8_SHUFFLE_1(n, l0, idx0, ax0, bx0, bx1, lo, hi) \
@@ -641,10 +651,22 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
 		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
+	} \
+	if(ALGO == cryptonight_v8_reversewaltz) \
+	{ \
+		const uint64_t idx1 = idx0 & MASK; \
+		const __m128i chunk3 = _mm_xor_si128(_mm_load_si128((__m128i *)&l0[idx1 ^ 0x10]), _mm_set_epi64x(lo, hi)); \
+		const __m128i chunk2 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x20]); \
+		hi ^= ((uint64_t*)&chunk2)[0]; \
+		lo ^= ((uint64_t*)&chunk2)[1]; \
+		const __m128i chunk1 = _mm_load_si128((__m128i *)&l0[idx1 ^ 0x30]); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x10], _mm_add_epi64(chunk3, bx1)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x20], _mm_add_epi64(chunk1, bx0)); \
+		_mm_store_si128((__m128i *)&l0[idx1 ^ 0x30], _mm_add_epi64(chunk2, ax0)); \
 	}
 
 #define CN_MONERO_V8_DIV(n, cx, sqrt_result, division_result_xmm, cl) \
-	if(ALGO == cryptonight_monero_v8) \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \
 	{ \
 		uint64_t sqrt_result_tmp; \
 		assign(sqrt_result_tmp, sqrt_result); \
@@ -705,7 +727,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 		idx0 = h0[0] ^ h0[4]; \
 		ax0 = _mm_set_epi64x(h0[1] ^ h0[5], idx0); \
 		bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); \
-		if(ALGO == cryptonight_monero_v8) \
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \
 		{ \
 			bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); \
 			division_result_xmm = _mm_cvtsi64_si128(h0[12]); \
@@ -744,7 +766,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 	ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 	if(PREFETCH) \
 		_mm_prefetch((const char*)ptr0, _MM_HINT_T0); \
-	if(ALGO != cryptonight_monero_v8) \
+	if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_v8_reversewaltz) \
 		bx0 = cx
 
 #define CN_STEP3(n, monero_const, l0, ax0, bx0, idx0, ptr0, lo, cl, ch, al0, ah0, cx, bx1, sqrt_result, division_result_xmm) \
@@ -761,7 +783,7 @@ inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
 		ah0 += lo; \
 		al0 += hi; \
 	} \
-	if(ALGO == cryptonight_monero_v8) \
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) \
 	{ \
 		bx1 = bx0; \
 		bx0 = cx; \
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 50507f2ae..b973f5ccf 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -427,6 +427,16 @@ bool minethd::self_test()
 			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
 			bResult = bResult && memcmp(out, "\x30\x5f\x66\xfe\xbb\xf3\x60\x0e\xda\xbb\x60\xf7\xf1\xc9\xb9\x0a\x3a\xe8\x5a\x31\xd4\x76\xca\x38\x1d\x56\x18\xa6\xc6\x27\x60\xd7", 32) == 0;
 		}
+		else if(algo == POW(cryptonight_v8_reversewaltz))
+		{
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
+			bResult = memcmp(out, "\x32\xf7\x36\xec\x1d\x2f\x3f\xc5\x4c\x49\xbe\xb8\xa0\x47\x6c\xbf\xdd\x14\xc3\x51\xb9\xc6\xd7\x2c\x6f\x9f\xfc\xb5\x87\x5b\xe6\xb3", 32) == 0;
+
+			hashf = func_selector(::jconf::inst()->HaveHardwareAes(), true, algo);
+			hashf("This is a test This is a test This is a test", 44, out, ctx, algo);
+			bResult &= memcmp(out, "\x32\xf7\x36\xec\x1d\x2f\x3f\xc5\x4c\x49\xbe\xb8\xa0\x47\x6c\xbf\xdd\x14\xc3\x51\xb9\xc6\xd7\x2c\x6f\x9f\xfc\xb5\x87\x5b\xe6\xb3", 32) == 0;
+		}
 		else
 			printer::inst()->print_msg(L0,
 				"Cryptonight hash self-test NOT defined for POW %s", algo.Name().c_str());
@@ -564,6 +574,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	case cryptonight_conceal:
 		algv = 13;
 		break;
+	case cryptonight_v8_reversewaltz:
+		algv = 14;
+		break;
 	default:
 		algv = 2;
 		break;
@@ -638,7 +651,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash<N>::template hash<cryptonight_conceal, false, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_conceal, true, false>,
 		Cryptonight_hash<N>::template hash<cryptonight_conceal, false, true>,
-		Cryptonight_hash<N>::template hash<cryptonight_conceal, true, true>
+		Cryptonight_hash<N>::template hash<cryptonight_conceal, true, true>,
+
+		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, false, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, true, false>,
+		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, false, true>,
+		Cryptonight_hash<N>::template hash<cryptonight_v8_reversewaltz, true, true>
 	};
 
 	std::bitset<2> digit;
@@ -647,7 +665,6 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 
 	auto selected_function = func_table[ algv << 2 | digit.to_ulong() ];
 
-
 	// check for asm optimized version for cryptonight_v8
 	if(N <= 2 && algo == cryptonight_monero_v8 && bHaveAes && algo.Mem() == CN_MEMORY && algo.Iter() == CN_ITER)
 	{
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index db0787789..09b014548 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -24,6 +24,7 @@ enum xmrstak_algo_id
 	cryptonight_superfast = 12,
 	cryptonight_gpu = 13,
 	cryptonight_conceal = 14,
+	cryptonight_v8_reversewaltz = 15, //equal to cryptonight_monero_v8 but with 3/4 iterations and reversed shuffle operation
 
 	cryptonight_turtle = start_derived_algo_id,
 	cryptonight_v8_half = (start_derived_algo_id + 1),
@@ -37,7 +38,7 @@ enum xmrstak_algo_id
  */
 inline std::string get_algo_name(xmrstak_algo_id algo_id)
 {
-	static std::array<std::string, 15> base_algo_names =
+	static std::array<std::string, 16> base_algo_names =
 	{{
 		"invalid_algo",
 		"cryptonight",
@@ -53,7 +54,8 @@ inline std::string get_algo_name(xmrstak_algo_id algo_id)
 		"cryptonight_v8",
 		"cryptonight_superfast",
 		"cryptonight_gpu",
-		"cryptonight_conceal"
+		"cryptonight_conceal",
+		"cryptonight_v8_reversewaltz" // used by graft
 	}};
 
 	static std::array<std::string, 3> derived_algo_names =
@@ -172,9 +174,11 @@ constexpr uint32_t CN_TURTLE_MASK = 0x1FFF0;
 
 constexpr uint32_t CN_ZELERIUS_ITER = 0x60000;
 
+constexpr uint32_t CN_WALTZ_ITER = 0x60000;
+
 inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 {
-	static std::array<xmrstak_algo, 15> pow = {{
+	static std::array<xmrstak_algo, 16> pow = {{
 		{invalid_algo, invalid_algo},
 		{cryptonight, cryptonight, CN_ITER, CN_MEMORY},
 		{cryptonight_lite, cryptonight_lite, CN_ITER/2, CN_MEMORY/2},
@@ -189,7 +193,8 @@ inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 		{cryptonight_monero_v8, cryptonight_monero_v8, CN_ITER, CN_MEMORY},
 		{cryptonight_superfast, cryptonight_superfast, CN_ITER/4, CN_MEMORY},
 		{cryptonight_gpu, cryptonight_gpu, CN_GPU_ITER, CN_MEMORY, CN_GPU_MASK},
-		{cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY}
+		{cryptonight_conceal, cryptonight_conceal, CN_ITER/2, CN_MEMORY},
+		{cryptonight_v8_reversewaltz, cryptonight_v8_reversewaltz, CN_WALTZ_ITER, CN_MEMORY}
 	}};
 
 	static std::array<xmrstak_algo, 3> derived_pow =
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index 184825222..ab4868fc3 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -314,7 +314,7 @@ __global__ void cryptonight_core_gpu_phase2_double(
 	uint64_t bx1;
 	uint32_t sqrt_result;
 	uint64_t division_result;
-	if(ALGO == cryptonight_monero_v8)
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 	{
 		bx0 = ((uint64_t*)(d_ctx_b + thread * 12))[sub];
 		bx1 = ((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub];
@@ -370,6 +370,22 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
 		}
 
+		if(ALGO == cryptonight_v8_reversewaltz)
+		{
+
+			const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ];
+			const uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+			const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ];
+#if (__CUDACC_VER_MAJOR__ >= 9)
+			__syncwarp();
+#else
+			__syncthreads( );
+#endif
+			myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
+			myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
+			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+		}
+
 		myChunks[ idx1 + sub ] = cx_aes ^ bx0;
 		if(MEM_MODE == 0)
 		{
@@ -397,14 +413,14 @@ __global__ void cryptonight_core_gpu_phase2_double(
 		else
 			((ulonglong4*)myChunks)[sub] = ((ulonglong4*)ptr0)[sub];
 
-		if(ALGO != cryptonight_monero_v8)
+		if(ALGO != cryptonight_monero_v8 && ALGO != cryptonight_v8_reversewaltz)
 			bx0 = cx_aes;
 
 		uint64_t cx_mul;
 		((uint32_t*)&cx_mul)[0] = shuffle<2>(sPtr, sub, cx_aes.x , 0);
 		((uint32_t*)&cx_mul)[1] = shuffle<2>(sPtr, sub, cx_aes.y , 0);
 
-		if((ALGO == cryptonight_monero_v8) && sub == 1)
+		if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && sub == 1)
 		{
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			((uint32_t*)&division_result)[1] ^= sqrt_result;
@@ -438,6 +454,21 @@ __global__ void cryptonight_core_gpu_phase2_double(
 				__syncwarp();
 #else
 				__syncthreads( );
+#endif
+				myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
+				myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
+				myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
+			}
+			if(ALGO == cryptonight_v8_reversewaltz)
+			{
+				const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ] ^ res;
+				uint64_t chunk2 = myChunks[ idx1 ^ 4 + sub ];
+				res ^= ((uint64_t*)&chunk2)[0];
+				const uint64_t chunk1 = myChunks[ idx1 ^ 6 + sub ];
+#if (__CUDACC_VER_MAJOR__ >= 9)
+				__syncwarp();
+#else
+				__syncthreads( );
 #endif
 				myChunks[ idx1 ^ 2 + sub ] = chunk3 + bx1;
 				myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
@@ -445,7 +476,7 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			}
 			ax0 += res;
 		}
-		if(ALGO == cryptonight_monero_v8)
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 		{
 			bx1 = bx0;
 			bx0 = cx_aes;
@@ -468,7 +499,7 @@ __global__ void cryptonight_core_gpu_phase2_double(
 	if ( bfactor > 0 )
 	{
 		((uint64_t*)(d_ctx_a + thread * 4))[sub] = ax0;
-		if(ALGO == cryptonight_monero_v8)
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 		{
 			((uint64_t*)(d_ctx_b + thread * 12))[sub] = bx0;
 			((uint64_t*)(d_ctx_b + thread * 12 + 4))[sub] = bx1;
@@ -814,7 +845,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce, const xmrstak_algo
 
 	for ( int i = 0; i < partcount; i++ )
 	{
-		if(ALGO == cryptonight_monero_v8)
+		if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 		{
 			// two threads per block
 			CUDA_CHECK_MSG_KERNEL(
@@ -1019,7 +1050,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, const xmrstak_algo& miner_algo, ui
 		cryptonight_core_gpu_hash_gpu<cryptonight_gpu, 1>,
 
 		cryptonight_core_gpu_hash<cryptonight_conceal, 0>,
-		cryptonight_core_gpu_hash<cryptonight_conceal, 1>
+		cryptonight_core_gpu_hash<cryptonight_conceal, 1>,
+
+		cryptonight_core_gpu_hash<cryptonight_v8_reversewaltz, 0>,
+		cryptonight_core_gpu_hash<cryptonight_v8_reversewaltz, 1>,
 	};
 
 	std::bitset<1> digit;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 43e21fb42..44eb19264 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -127,7 +127,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	XOR_BLOCKS_DST( ctx_state, ctx_state + 8, ctx_a );
 	XOR_BLOCKS_DST( ctx_state + 4, ctx_state + 12, ctx_b );
 	memcpy( d_ctx_a + thread * 4, ctx_a, 4 * 4 );
-	if(ALGO == cryptonight_monero_v8)
+	if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 	{
 		memcpy( d_ctx_b + thread * 12, ctx_b, 4 * 4 );
 		// bx1
@@ -314,7 +314,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	{
 		ctx_b_size += sizeof(uint32_t) * 4 * wsize;
 	}
-	else if(std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end())
+	else if((std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end())
+		|| (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end()))
 	{
 		// bx1 (16byte), division_result (8byte) and sqrt_result (8byte)
 		ctx_b_size = 3 * 4 * sizeof(uint32_t) * wsize;
@@ -376,6 +377,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_gpu><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	else if(miner_algo == cryptonight_v8_reversewaltz)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_v8_reversewaltz><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else
 	{
 		/* pass two times d_ctx_state because the second state is used later in phase1,
@@ -743,7 +749,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		}
 
 		// check if cryptonight_monero_v8 is selected for the user pool
-		bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());
+		bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end())
+			|| (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_v8_reversewaltz) != neededAlgorithms.end());
 
 		// overwrite default config if cryptonight_monero_v8 is mined and GPU has at least compute capability 5.0
 		if(useCryptonight_v8 && gpuArch >= 50)
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 5dbddb09b..3c85955b3 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -107,8 +107,9 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_v7_stellite", {POW(cryptonight_stellite)},  {POW(cryptonight_monero_v8)}, nullptr },
 	{ "cryptonight_gpu",         {POW(cryptonight_gpu)},       {POW(cryptonight_gpu)},       "pool.ryo-currency.com:3333" },
 	{ "cryptonight_conceal",     {POW(cryptonight_conceal)},   {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_v8_reversewaltz",{POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" },
 	{ "freehaven",               {POW(cryptonight_superfast)}, {POW(cryptonight_monero_v8)}, nullptr },
-	{ "graft",                   {POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr },
+	{ "graft",                   {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" },
 	{ "haven",                   {POW(cryptonight_haven)},     {POW(cryptonight_heavy)},     nullptr },
 	{ "lethean",                 {POW(cryptonight_monero)},    {POW(cryptonight_monero_v8)}, nullptr },
 	{ "masari",                  {POW(cryptonight_v8_half)},   {POW(cryptonight_monero_v8)}, nullptr },
@@ -118,7 +119,8 @@ xmrstak::coin_selection coins[] = {
 	{ "stellite",                {POW(cryptonight_v8_half)},   {POW(cryptonight_monero_v8)}, nullptr },
 	{ "turtlecoin",              {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr },
 	{ "plenteum",			     {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
-	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_monero_v8)}, nullptr }
+	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_monero_v8)}, nullptr },
+	{ "reverse_waltz",           {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" }
 };
 
 constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index f8f1d7d6c..8de92347b 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -51,6 +51,7 @@ POOLCONF],
  *    cryptonight_v8
  *    cryptonight_v8_half (used by masari and stellite)
  *    cryptonight_v8_zelerius
+ *    cryptonight_v8_reversewaltz (used by graft)
  *    # 4MiB scratchpad memory
  *    cryptonight_bittube2
  *    cryptonight_haven

From 4bfc9d39e96f3bfd3b720b35349d8dc141a1cfc6 Mon Sep 17 00:00:00 2001
From: EDDragonWolf <eddragonwolf@ukr.net>
Date: Wed, 27 Feb 2019 09:52:19 -0800
Subject: [PATCH 2/4] Fixed format of CryptoNight ReverseWaltz configuration

---
 README.md         | 2 +-
 xmrstak/jconf.cpp | 7 +++----
 xmrstak/pools.tpl | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index c6bbfdf00..a7eee6041 100644
--- a/README.md
+++ b/README.md
@@ -71,8 +71,8 @@ If your prefered coin is not listed, you can choose one of the following algorit
     - cryptonight_v7_stellite
     - cryptonight_v8
     - cryptonight_v8_half (used by masari and stellite)
-    - cryptonight_v8_zelerius
     - cryptonight_v8_reversewaltz (used by graft)
+    - cryptonight_v8_zelerius
 - 4MiB scratchpad memory
     - cryptonight_haven
     - cryptonight_heavy
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index 3c85955b3..04c8735ba 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -103,13 +103,13 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_v7",          {POW(cryptonight_monero)},    {POW(cryptonight_monero_v8)}, nullptr },
 	{ "cryptonight_v8",          {POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr },
 	{ "cryptonight_v8_half",     {POW(cryptonight_v8_half)},   {POW(cryptonight_monero_v8)}, nullptr },
+	{ "cryptonight_v8_reversewaltz", {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, nullptr },
 	{ "cryptonight_v8_zelerius", {POW(cryptonight_v8_zelerius)},{POW(cryptonight_monero_v8)}, nullptr },
 	{ "cryptonight_v7_stellite", {POW(cryptonight_stellite)},  {POW(cryptonight_monero_v8)}, nullptr },
 	{ "cryptonight_gpu",         {POW(cryptonight_gpu)},       {POW(cryptonight_gpu)},       "pool.ryo-currency.com:3333" },
 	{ "cryptonight_conceal",     {POW(cryptonight_conceal)},   {POW(cryptonight_monero_v8)}, nullptr },
-	{ "cryptonight_v8_reversewaltz",{POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" },
 	{ "freehaven",               {POW(cryptonight_superfast)}, {POW(cryptonight_monero_v8)}, nullptr },
-	{ "graft",                   {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" },
+	{ "graft",                   {POW(cryptonight_v8_reversewaltz), 12, POW(cryptonight_monero_v8)}, {POW(cryptonight_monero_v8)}, nullptr },
 	{ "haven",                   {POW(cryptonight_haven)},     {POW(cryptonight_heavy)},     nullptr },
 	{ "lethean",                 {POW(cryptonight_monero)},    {POW(cryptonight_monero_v8)}, nullptr },
 	{ "masari",                  {POW(cryptonight_v8_half)},   {POW(cryptonight_monero_v8)}, nullptr },
@@ -119,8 +119,7 @@ xmrstak::coin_selection coins[] = {
 	{ "stellite",                {POW(cryptonight_v8_half)},   {POW(cryptonight_monero_v8)}, nullptr },
 	{ "turtlecoin",              {POW(cryptonight_turtle), 6u,POW(cryptonight_aeon)}, {POW(cryptonight_aeon)}, nullptr },
 	{ "plenteum",			     {POW(cryptonight_turtle)},    {POW(cryptonight_turtle)},    nullptr },
-	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_monero_v8)}, nullptr },
-	{ "reverse_waltz",           {POW(cryptonight_v8_reversewaltz)}, {POW(cryptonight_monero_v8)}, "mining.grftpool.com:3333" }
+	{ "zelerius",                {POW(cryptonight_v8_zelerius), 7, POW(cryptonight_monero_v8)},   {POW(cryptonight_monero_v8)}, nullptr }
 };
 
 constexpr size_t coin_algo_size = (sizeof(coins)/sizeof(coins[0]));
diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl
index 8de92347b..891fb8d78 100644
--- a/xmrstak/pools.tpl
+++ b/xmrstak/pools.tpl
@@ -50,8 +50,8 @@ POOLCONF],
  *    cryptonight_v7
  *    cryptonight_v8
  *    cryptonight_v8_half (used by masari and stellite)
- *    cryptonight_v8_zelerius
  *    cryptonight_v8_reversewaltz (used by graft)
+ *    cryptonight_v8_zelerius
  *    # 4MiB scratchpad memory
  *    cryptonight_bittube2
  *    cryptonight_haven

From c9db12855e25d82436fb0af49dc57e6efb623069 Mon Sep 17 00:00:00 2001
From: EDDragonWolf <eddragonwolf@ukr.net>
Date: Thu, 28 Feb 2019 10:17:28 -0800
Subject: [PATCH 3/4] Fixed if-else condition for OpenCL and CUDA
 implementation

---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 4 +---
 xmrstak/backend/nvidia/nvcc_code/cuda_core.cu     | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 395101b2b..5229e15b4 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -793,9 +793,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
 			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 		}
-#endif
-
-#if(ALGO == cryptonight_v8_reversewaltz)
+#elif(ALGO == cryptonight_v8_reversewaltz)
 		{
 			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1));
 			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index ab4868fc3..a8447bb69 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -369,8 +369,7 @@ __global__ void cryptonight_core_gpu_phase2_double(
 			myChunks[ idx1 ^ 4 + sub ] = chunk1 + bx0;
 			myChunks[ idx1 ^ 6 + sub ] = chunk2 + ax0;
 		}
-
-		if(ALGO == cryptonight_v8_reversewaltz)
+		else if(ALGO == cryptonight_v8_reversewaltz)
 		{
 
 			const uint64_t chunk3 = myChunks[ idx1 ^ 2 + sub ];

From d10dafcdae67389d3b5d9b2fc341cb122de8dc13 Mon Sep 17 00:00:00 2001
From: EDDragonWolf <eddragonwolf@ukr.net>
Date: Wed, 6 Mar 2019 00:56:34 -0800
Subject: [PATCH 4/4] Fixed AMD support of CryptoNight v8 ReverseWaltz

---
 xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 5229e15b4..b78f2bcf7 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -30,7 +30,7 @@ R"===(
 #define cryptonight_superfast 12
 #define cryptonight_gpu 13
 #define cryptonight_conceal 14
-#define cryptonight_v8_reversewaltz 15
+#define cryptonight_v8_reversewaltz 17
 
 /* For Mesa clover support */
 #ifdef cl_clang_storage_class_specifiers
@@ -864,14 +864,12 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			result_mul ^= chunk2;
 			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
 #if(ALGO == cryptonight_v8_reversewaltz)
-			{
-				ulong2 chunk_tmp = chunk3;
-				chunk3 = chunk1;
-				chunk1 = chunk_tmp;
-			}
-#endif
+			SCRATCHPAD_CHUNK(1) = as_uint4(chunk1 + ((ulong2 *)(b_x + 1))[0]);
+			SCRATCHPAD_CHUNK(2) = as_uint4(chunk3 + ((ulong2 *)b_x)[0]);
+#else
 			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
 			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+#endif
 			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 			a[0] += result_mul.s0;
 			a[1] += result_mul.s1;