Merge pull request #203 from fancyIX/feature/#201

Issue #201 [yescrypt]
fancyIX · Nov 14, 2020 · 91e924e · 91e924e
2 parents 3acc616 + b3ecb4c
commit 91e924e
Show file tree

Hide file tree

Showing 10 changed files with 1,033 additions and 9 deletions.
diff --git a/algorithm.c b/algorithm.c
@@ -445,6 +445,76 @@ static cl_int queue_yescrypt_multikernel(_clState *clState, dev_blk_ctx *blk, __
   CL_SET_ARG(le_target);
 
   return status;
+}
+
+static cl_int queue_yescrypt_navikernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_uint le_target;
+  cl_int status = 0;
+
+  le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]);
+  flip80(clState->cldata, blk->work->data);
+  uint32_t h[8], data[32];
+
+	h[0] = 0x6A09E667; h[1] = 0xBB67AE85; h[2] = 0x3C6EF372; h[3] = 0xA54FF53A;
+	h[4] = 0x510E527F; h[5] = 0x9B05688C; h[6] = 0x1F83D9AB; h[7] = 0x5BE0CD19;
+	data[0] = ((uint32_t*)(clState->cldata))[0]; data[1] = ((uint32_t*)(clState->cldata))[1];
+	data[2] = ((uint32_t*)(clState->cldata))[2]; data[3] = ((uint32_t*)(clState->cldata))[3];
+	data[4] = ((uint32_t*)(clState->cldata))[4]; data[5] = ((uint32_t*)(clState->cldata))[5];
+	data[6] = ((uint32_t*)(clState->cldata))[6]; data[7] = ((uint32_t*)(clState->cldata))[7];
+	data[8] = ((uint32_t*)(clState->cldata))[8]; data[9] = ((uint32_t*)(clState->cldata))[9];
+	data[10] = ((uint32_t*)(clState->cldata))[10]; data[11] = ((uint32_t*)(clState->cldata))[11];
+	data[12] = ((uint32_t*)(clState->cldata))[12]; data[13] = ((uint32_t*)(clState->cldata))[13];
+	data[14] = ((uint32_t*)(clState->cldata))[14]; data[15] = ((uint32_t*)(clState->cldata))[15];
+	for (int i = 0; i<20; i++) { data[i] = htobe32(data[i]); }
+	sha256_round_body_host(data, h);
+
+size_t worksize = blk->work->thr->cgpu->thread_concurrency;
+threads = worksize;
+
+    status=clEnqueueWriteBuffer(clState->commandQueue, clState->padbuffer8, CL_TRUE, 0, 32, h, 0, NULL, NULL);
+    status=clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, CL_TRUE, 0, 80, clState->cldata, 0, NULL, NULL);
+	if (status != CL_SUCCESS) {
+      printf("status %d: clEnqueueWriteBuffer\n", status);
+      exit(1);
+	}
+
+    clSetKernelArg(clState->yescrypt_gpu_hash_k0, 0, sizeof(clState->padbuffer8), &(clState->padbuffer8));
+    clSetKernelArg(clState->yescrypt_gpu_hash_k0, 1, sizeof(clState->CLbuffer0), &(clState->CLbuffer0));
+	clSetKernelArg(clState->yescrypt_gpu_hash_k0, 2, sizeof(clState->buffer1), &(clState->buffer1));
+	clSetKernelArg(clState->yescrypt_gpu_hash_k0, 3, sizeof(clState->buffer4), &(clState->buffer4));
+	clSetKernelArg(clState->yescrypt_gpu_hash_k0, 4, sizeof(threads), &threads);
+
+		clSetKernelArg(clState->yescrypt_gpu_hash_k1, 0, sizeof(clState->buffer1), &(clState->buffer1));
+	    clSetKernelArg(clState->yescrypt_gpu_hash_k1, 1, sizeof(clState->buffer2), &(clState->buffer2));
+
+		clSetKernelArg(clState->yescrypt_gpu_hash_k1, 3, sizeof(threads), &threads);
+
+			clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 0, sizeof(clState->buffer1), &(clState->buffer1));
+	        clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 1, sizeof(clState->buffer2), &(clState->buffer2));
+			clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 2, sizeof(clState->buffer3), &(clState->buffer3));
+
+			clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 5, sizeof(threads), &threads);
+
+			clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 0, sizeof(clState->buffer1), &(clState->buffer1));
+	        clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 1, sizeof(clState->buffer2), &(clState->buffer2));
+			clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 2, sizeof(clState->buffer3), &(clState->buffer3));
+
+			clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 5, sizeof(threads), &threads);
+
+    clSetKernelArg(clState->yescrypt_gpu_hash_k5, 0, sizeof(clState->CLbuffer0), &(clState->CLbuffer0));
+	clSetKernelArg(clState->yescrypt_gpu_hash_k5, 1, sizeof(clState->buffer1), &(clState->buffer1));
+	clSetKernelArg(clState->yescrypt_gpu_hash_k5, 2, sizeof(clState->buffer4), &(clState->buffer4));
+	clSetKernelArg(clState->yescrypt_gpu_hash_k5, 3, sizeof(clState->outputBuffer), &(clState->outputBuffer));
+
+	clSetKernelArg(clState->yescrypt_gpu_hash_k5, 5, sizeof(threads), &threads);
+  clSetKernelArg(clState->yescrypt_gpu_hash_k5, 4, sizeof(uint32_t), &le_target);
+
+
+
+
+
+
 }
 
 static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
@@ -2294,6 +2364,11 @@ static algorithm_settings_t algos[] = {
   A_YESCRYPT_MULTI("yescrypt-multi"),
 #undef A_YESCRYPT_MULTI
 
+#define A_YESCRYPT_NAVI(a) \
+  { a, ALGO_YESCRYPT_NAVI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, NULL, NULL, queue_yescrypt_navikernel, gen_hash, append_neoscrypt_compiler_options}
+  A_YESCRYPT_NAVI("yescryptf"),
+#undef A_YESCRYPT_NAVI
+
   // kernels starting from this will have difficulty calculated by using quarkcoin algorithm
 #define A_QUARK(a, b) \
   { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options }

diff --git a/algorithm.h b/algorithm.h
@@ -43,6 +43,7 @@ typedef enum {
   ALGO_PLUCK,
   ALGO_YESCRYPT,
   ALGO_YESCRYPT_MULTI,
+  ALGO_YESCRYPT_NAVI,
   ALGO_BLAKECOIN,
   ALGO_BLAKE,
   ALGO_SIA,

diff --git a/algorithm/yescrypt.c b/algorithm/yescrypt.c
@@ -116,4 +116,122 @@ bool scanhash_yescrypt(struct thr_info *thr, const unsigned char __maybe_unused
 	}
 
 	return ret;
-}
+}
+
+#define SPH_T32(x) (x)
+#define ROTR(x, n)      ((x >> n) | (x << (32 - n)))
+
+static const uint32_t cpu_K[64] = {
+	0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+	0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+	0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+	0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+	0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+	0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+	0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+	0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
+};
+
+static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t *pd,
+	uint32_t e, uint32_t f, uint32_t g, uint32_t *ph,
+	uint32_t in, const uint32_t Kshared)
+{
+	uint32_t t1, t2;
+	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
+	uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
+	uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
+	uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
+
+	t1 = *ph + bsg21 + vxandx + Kshared + in;
+	t2 = bsg20 + andorv;
+	*pd = *pd + t1;
+	*ph = t1 + t2;
+}
+
+static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t *pd,
+	uint32_t e, uint32_t f, uint32_t g, uint32_t *ph,
+	uint32_t* in, uint32_t pc, const uint32_t Kshared)
+{
+	uint32_t t1, t2;
+
+	int pcidx1 = (pc - 2) & 0xF;
+	int pcidx2 = (pc - 7) & 0xF;
+	int pcidx3 = (pc - 15) & 0xF;
+
+	uint32_t inx0 = in[pc];
+	uint32_t inx1 = in[pcidx1];
+	uint32_t inx2 = in[pcidx2];
+	uint32_t inx3 = in[pcidx3];
+
+	uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
+	uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
+	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
+	uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
+	uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
+	uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
+
+	in[pc] = ssg21 + inx2 + ssg20 + inx0;
+
+	t1 = *ph + bsg21 + vxandx + Kshared + in[pc];
+	t2 = bsg20 + andorv;
+	*pd = *pd + t1;
+	*ph = t1 + t2;
+}
+
+void sha256_round_body_host(uint32_t* in, uint32_t* state)
+{
+	uint32_t a = state[0];
+	uint32_t b = state[1];
+	uint32_t c = state[2];
+	uint32_t d = state[3];
+	uint32_t e = state[4];
+	uint32_t f = state[5];
+	uint32_t g = state[6];
+	uint32_t h = state[7];
+
+	sha256_step1_host(a, b, c, &d, e, f, g, &h, in[0], cpu_K[0]);
+	sha256_step1_host(h, a, b, &c, d, e, f, &g, in[1], cpu_K[1]);
+	sha256_step1_host(g, h, a, &b, c, d, e, &f, in[2], cpu_K[2]);
+	sha256_step1_host(f, g, h, &a, b, c, d, &e, in[3], cpu_K[3]);
+	sha256_step1_host(e, f, g, &h, a, b, c, &d, in[4], cpu_K[4]);
+	sha256_step1_host(d, e, f, &g, h, a, b, &c, in[5], cpu_K[5]);
+	sha256_step1_host(c, d, e, &f, g, h, a, &b, in[6], cpu_K[6]);
+	sha256_step1_host(b, c, d, &e, f, g, h, &a, in[7], cpu_K[7]);
+	sha256_step1_host(a, b, c, &d, e, f, g, &h, in[8], cpu_K[8]);
+	sha256_step1_host(h, a, b, &c, d, e, f, &g, in[9], cpu_K[9]);
+	sha256_step1_host(g, h, a, &b, c, d, e, &f, in[10], cpu_K[10]);
+	sha256_step1_host(f, g, h, &a, b, c, d, &e, in[11], cpu_K[11]);
+	sha256_step1_host(e, f, g, &h, a, b, c, &d, in[12], cpu_K[12]);
+	sha256_step1_host(d, e, f, &g, h, a, b, &c, in[13], cpu_K[13]);
+	sha256_step1_host(c, d, e, &f, g, h, a, &b, in[14], cpu_K[14]);
+	sha256_step1_host(b, c, d, &e, f, g, h, &a, in[15], cpu_K[15]);
+
+	for (int i = 0; i < 3; i++)
+	{
+		sha256_step2_host(a, b, c, &d, e, f, g, &h, in, 0, cpu_K[16 + 16 * i]);
+		sha256_step2_host(h, a, b, &c, d, e, f, &g, in, 1, cpu_K[17 + 16 * i]);
+		sha256_step2_host(g, h, a, &b, c, d, e, &f, in, 2, cpu_K[18 + 16 * i]);
+		sha256_step2_host(f, g, h, &a, b, c, d, &e, in, 3, cpu_K[19 + 16 * i]);
+		sha256_step2_host(e, f, g, &h, a, b, c, &d, in, 4, cpu_K[20 + 16 * i]);
+		sha256_step2_host(d, e, f, &g, h, a, b, &c, in, 5, cpu_K[21 + 16 * i]);
+		sha256_step2_host(c, d, e, &f, g, h, a, &b, in, 6, cpu_K[22 + 16 * i]);
+		sha256_step2_host(b, c, d, &e, f, g, h, &a, in, 7, cpu_K[23 + 16 * i]);
+		sha256_step2_host(a, b, c, &d, e, f, g, &h, in, 8, cpu_K[24 + 16 * i]);
+		sha256_step2_host(h, a, b, &c, d, e, f, &g, in, 9, cpu_K[25 + 16 * i]);
+		sha256_step2_host(g, h, a, &b, c, d, e, &f, in, 10, cpu_K[26 + 16 * i]);
+		sha256_step2_host(f, g, h, &a, b, c, d, &e, in, 11, cpu_K[27 + 16 * i]);
+		sha256_step2_host(e, f, g, &h, a, b, c, &d, in, 12, cpu_K[28 + 16 * i]);
+		sha256_step2_host(d, e, f, &g, h, a, b, &c, in, 13, cpu_K[29 + 16 * i]);
+		sha256_step2_host(c, d, e, &f, g, h, a, &b, in, 14, cpu_K[30 + 16 * i]);
+		sha256_step2_host(b, c, d, &e, f, g, h, &a, in, 15, cpu_K[31 + 16 * i]);
+	}
+
+	state[0] += a;
+	state[1] += b;
+	state[2] += c;
+	state[3] += d;
+	state[4] += e;
+	state[5] += f;
+	state[6] += g;
+	state[7] += h;
+}
diff --git a/algorithm/yescrypt.h b/algorithm/yescrypt.h
@@ -2,9 +2,12 @@
 #define YESCRYPT_H
 
 #include "miner.h"
+#define YESCRYPT_NAVI_SCRATCHBUF_SIZE (2 * 2048 * 8 * 4 + 2 * 16 * 8 * 4 + 512 * 4 + 8 * 4 + 32) //uchar
 #define YESCRYPT_SCRATCHBUF_SIZE (128 * 2048 * 8 ) //uchar
 #define YESCRYP_SECBUF_SIZE (128*64*8)
 extern int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
 extern void yescrypt_regenhash(struct work *work);
 
+void sha256_round_body_host(uint32_t* in, uint32_t* state);
+
 #endif /* YESCRYPT_H */
diff --git a/driver-opencl.c b/driver-opencl.c
@@ -1432,7 +1432,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
   if (clState->goffset)
     p_global_work_offset = (size_t *)&work->blk.nonce;
 
-if (gpu->algorithm.type != ALGO_MTP) {
+if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI) {
   if (gpu->algorithm.type == ALGO_ARGON2D) {
     const uint32_t throughput = gpu->throughput;
 	  const size_t global[] = { 16, throughput };
@@ -1578,6 +1578,40 @@ if (gpu->algorithm.type != ALGO_MTP) {
       return -1;
     }
   }
+}
+if (gpu->algorithm.type == ALGO_YESCRYPT_NAVI) {
+  cl_uint threads = globalThreads[0];
+  size_t worksize = globalThreads[0];
+
+  	uint32_t tpb = 32U;
+	size_t grid[1] = {worksize};
+	size_t grid2[2] = {worksize / tpb * 4U, tpb >> 2};
+	size_t grid3[2] = {worksize / tpb * 16U, tpb >> 4};
+	size_t block[1] = {tpb};
+	size_t block2[2] = {4U, tpb >> 2};
+	size_t block3[2] = {16U, tpb >> 4};
+
+status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k0, 1, NULL, grid, block, 0, NULL, NULL);
+
+	for (uint32_t i = 0; i < 4; i++) {
+		uint32_t offset = i * (worksize >> 2);
+clSetKernelArg(clState->yescrypt_gpu_hash_k1, 2, sizeof(uint32_t), &offset);
+		status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k1, 2, NULL, grid2, block2, 0, NULL, NULL);
+
+		for (uint32_t j = 0; j < 4; j++) {
+			uint32_t offset1 = j * (worksize >> 4);
+			uint32_t offset2 = (i * 4 + j) * (worksize >> 4);
+		    clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 3, sizeof(uint32_t), &offset1);
+			clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 4, sizeof(uint32_t), &offset2);
+			status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k2c_r8, 2, NULL, grid3, block3, 0, NULL, NULL);
+
+		    clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 3, sizeof(uint32_t), &offset1);
+			clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 4, sizeof(uint32_t), &offset2);
+			status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k2c1_r8, 2, NULL, grid3, block3, 0, NULL, NULL);
+		}
+	}
+
+	status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k5, 1, NULL, grid, block, 0, NULL, NULL);
 }
   status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
     buffersize, thrdata->res, 0, NULL, NULL);
@@ -1639,6 +1673,8 @@ static void opencl_thread_shutdown(struct thr_info *thr)
 	clReleaseMemObject(clState->buffer2);
 	if (clState->buffer3)
 	clReleaseMemObject(clState->buffer3);
+  if (clState->buffer4)
+	clReleaseMemObject(clState->buffer4);
   if (clState->MidstateBuf)
 	clReleaseMemObject(clState->MidstateBuf);
   if (clState->MatrixBuf)

diff --git a/kernel/yescrypt.cl b/kernel/yescrypt.cl
@@ -41,7 +41,7 @@ __kernel void search(__global const uchar* restrict input, __global uint* restri
 
 	__global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
 	__global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS)));
-	__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
+	__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));