Skip to content

Commit

Permalink
Merge pull request #203 from fancyIX/feature/#201
Browse files Browse the repository at this point in the history
Issue #201 [yescrypt]
  • Loading branch information
fancyIX committed Nov 14, 2020
2 parents 3acc616 + b3ecb4c commit 91e924e
Show file tree
Hide file tree
Showing 10 changed files with 1,033 additions and 9 deletions.
75 changes: 75 additions & 0 deletions algorithm.c
Expand Up @@ -445,6 +445,76 @@ static cl_int queue_yescrypt_multikernel(_clState *clState, dev_blk_ctx *blk, __
CL_SET_ARG(le_target);

return status;
}

static cl_int queue_yescrypt_navikernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
{
cl_uint le_target;
cl_int status = 0;

le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]);
flip80(clState->cldata, blk->work->data);
uint32_t h[8], data[32];

h[0] = 0x6A09E667; h[1] = 0xBB67AE85; h[2] = 0x3C6EF372; h[3] = 0xA54FF53A;
h[4] = 0x510E527F; h[5] = 0x9B05688C; h[6] = 0x1F83D9AB; h[7] = 0x5BE0CD19;
data[0] = ((uint32_t*)(clState->cldata))[0]; data[1] = ((uint32_t*)(clState->cldata))[1];
data[2] = ((uint32_t*)(clState->cldata))[2]; data[3] = ((uint32_t*)(clState->cldata))[3];
data[4] = ((uint32_t*)(clState->cldata))[4]; data[5] = ((uint32_t*)(clState->cldata))[5];
data[6] = ((uint32_t*)(clState->cldata))[6]; data[7] = ((uint32_t*)(clState->cldata))[7];
data[8] = ((uint32_t*)(clState->cldata))[8]; data[9] = ((uint32_t*)(clState->cldata))[9];
data[10] = ((uint32_t*)(clState->cldata))[10]; data[11] = ((uint32_t*)(clState->cldata))[11];
data[12] = ((uint32_t*)(clState->cldata))[12]; data[13] = ((uint32_t*)(clState->cldata))[13];
data[14] = ((uint32_t*)(clState->cldata))[14]; data[15] = ((uint32_t*)(clState->cldata))[15];
for (int i = 0; i<20; i++) { data[i] = htobe32(data[i]); }
sha256_round_body_host(data, h);

size_t worksize = blk->work->thr->cgpu->thread_concurrency;
threads = worksize;

status=clEnqueueWriteBuffer(clState->commandQueue, clState->padbuffer8, CL_TRUE, 0, 32, h, 0, NULL, NULL);
status=clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, CL_TRUE, 0, 80, clState->cldata, 0, NULL, NULL);
if (status != CL_SUCCESS) {
printf("status %d: clEnqueueWriteBuffer\n", status);
exit(1);
}

clSetKernelArg(clState->yescrypt_gpu_hash_k0, 0, sizeof(clState->padbuffer8), &(clState->padbuffer8));
clSetKernelArg(clState->yescrypt_gpu_hash_k0, 1, sizeof(clState->CLbuffer0), &(clState->CLbuffer0));
clSetKernelArg(clState->yescrypt_gpu_hash_k0, 2, sizeof(clState->buffer1), &(clState->buffer1));
clSetKernelArg(clState->yescrypt_gpu_hash_k0, 3, sizeof(clState->buffer4), &(clState->buffer4));
clSetKernelArg(clState->yescrypt_gpu_hash_k0, 4, sizeof(threads), &threads);

clSetKernelArg(clState->yescrypt_gpu_hash_k1, 0, sizeof(clState->buffer1), &(clState->buffer1));
clSetKernelArg(clState->yescrypt_gpu_hash_k1, 1, sizeof(clState->buffer2), &(clState->buffer2));

clSetKernelArg(clState->yescrypt_gpu_hash_k1, 3, sizeof(threads), &threads);

clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 0, sizeof(clState->buffer1), &(clState->buffer1));
clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 1, sizeof(clState->buffer2), &(clState->buffer2));
clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 2, sizeof(clState->buffer3), &(clState->buffer3));

clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 5, sizeof(threads), &threads);

clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 0, sizeof(clState->buffer1), &(clState->buffer1));
clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 1, sizeof(clState->buffer2), &(clState->buffer2));
clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 2, sizeof(clState->buffer3), &(clState->buffer3));

clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 5, sizeof(threads), &threads);

clSetKernelArg(clState->yescrypt_gpu_hash_k5, 0, sizeof(clState->CLbuffer0), &(clState->CLbuffer0));
clSetKernelArg(clState->yescrypt_gpu_hash_k5, 1, sizeof(clState->buffer1), &(clState->buffer1));
clSetKernelArg(clState->yescrypt_gpu_hash_k5, 2, sizeof(clState->buffer4), &(clState->buffer4));
clSetKernelArg(clState->yescrypt_gpu_hash_k5, 3, sizeof(clState->outputBuffer), &(clState->outputBuffer));

clSetKernelArg(clState->yescrypt_gpu_hash_k5, 5, sizeof(threads), &threads);
clSetKernelArg(clState->yescrypt_gpu_hash_k5, 4, sizeof(uint32_t), &le_target);






}

static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
Expand Down Expand Up @@ -2294,6 +2364,11 @@ static algorithm_settings_t algos[] = {
A_YESCRYPT_MULTI("yescrypt-multi"),
#undef A_YESCRYPT_MULTI

#define A_YESCRYPT_NAVI(a) \
{ a, ALGO_YESCRYPT_NAVI, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0,-1,CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE , yescrypt_regenhash, NULL, NULL, queue_yescrypt_navikernel, gen_hash, append_neoscrypt_compiler_options}
A_YESCRYPT_NAVI("yescryptf"),
#undef A_YESCRYPT_NAVI

// kernels starting from this will have difficulty calculated by using quarkcoin algorithm
#define A_QUARK(a, b) \
{ a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, NULL, NULL, queue_sph_kernel, gen_hash, append_x11_compiler_options }
Expand Down
1 change: 1 addition & 0 deletions algorithm.h
Expand Up @@ -43,6 +43,7 @@ typedef enum {
ALGO_PLUCK,
ALGO_YESCRYPT,
ALGO_YESCRYPT_MULTI,
ALGO_YESCRYPT_NAVI,
ALGO_BLAKECOIN,
ALGO_BLAKE,
ALGO_SIA,
Expand Down
120 changes: 119 additions & 1 deletion algorithm/yescrypt.c
Expand Up @@ -116,4 +116,122 @@ bool scanhash_yescrypt(struct thr_info *thr, const unsigned char __maybe_unused
}

return ret;
}
}

#define SPH_T32(x) (x)
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))

static const uint32_t cpu_K[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};

static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t *pd,
uint32_t e, uint32_t f, uint32_t g, uint32_t *ph,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1, t2;
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);

t1 = *ph + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
*pd = *pd + t1;
*ph = t1 + t2;
}

static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t *pd,
uint32_t e, uint32_t f, uint32_t g, uint32_t *ph,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1, t2;

int pcidx1 = (pc - 2) & 0xF;
int pcidx2 = (pc - 7) & 0xF;
int pcidx3 = (pc - 15) & 0xF;

uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];

uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);

in[pc] = ssg21 + inx2 + ssg20 + inx0;

t1 = *ph + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
*pd = *pd + t1;
*ph = t1 + t2;
}

void sha256_round_body_host(uint32_t* in, uint32_t* state)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];

sha256_step1_host(a, b, c, &d, e, f, g, &h, in[0], cpu_K[0]);
sha256_step1_host(h, a, b, &c, d, e, f, &g, in[1], cpu_K[1]);
sha256_step1_host(g, h, a, &b, c, d, e, &f, in[2], cpu_K[2]);
sha256_step1_host(f, g, h, &a, b, c, d, &e, in[3], cpu_K[3]);
sha256_step1_host(e, f, g, &h, a, b, c, &d, in[4], cpu_K[4]);
sha256_step1_host(d, e, f, &g, h, a, b, &c, in[5], cpu_K[5]);
sha256_step1_host(c, d, e, &f, g, h, a, &b, in[6], cpu_K[6]);
sha256_step1_host(b, c, d, &e, f, g, h, &a, in[7], cpu_K[7]);
sha256_step1_host(a, b, c, &d, e, f, g, &h, in[8], cpu_K[8]);
sha256_step1_host(h, a, b, &c, d, e, f, &g, in[9], cpu_K[9]);
sha256_step1_host(g, h, a, &b, c, d, e, &f, in[10], cpu_K[10]);
sha256_step1_host(f, g, h, &a, b, c, d, &e, in[11], cpu_K[11]);
sha256_step1_host(e, f, g, &h, a, b, c, &d, in[12], cpu_K[12]);
sha256_step1_host(d, e, f, &g, h, a, b, &c, in[13], cpu_K[13]);
sha256_step1_host(c, d, e, &f, g, h, a, &b, in[14], cpu_K[14]);
sha256_step1_host(b, c, d, &e, f, g, h, &a, in[15], cpu_K[15]);

for (int i = 0; i < 3; i++)
{
sha256_step2_host(a, b, c, &d, e, f, g, &h, in, 0, cpu_K[16 + 16 * i]);
sha256_step2_host(h, a, b, &c, d, e, f, &g, in, 1, cpu_K[17 + 16 * i]);
sha256_step2_host(g, h, a, &b, c, d, e, &f, in, 2, cpu_K[18 + 16 * i]);
sha256_step2_host(f, g, h, &a, b, c, d, &e, in, 3, cpu_K[19 + 16 * i]);
sha256_step2_host(e, f, g, &h, a, b, c, &d, in, 4, cpu_K[20 + 16 * i]);
sha256_step2_host(d, e, f, &g, h, a, b, &c, in, 5, cpu_K[21 + 16 * i]);
sha256_step2_host(c, d, e, &f, g, h, a, &b, in, 6, cpu_K[22 + 16 * i]);
sha256_step2_host(b, c, d, &e, f, g, h, &a, in, 7, cpu_K[23 + 16 * i]);
sha256_step2_host(a, b, c, &d, e, f, g, &h, in, 8, cpu_K[24 + 16 * i]);
sha256_step2_host(h, a, b, &c, d, e, f, &g, in, 9, cpu_K[25 + 16 * i]);
sha256_step2_host(g, h, a, &b, c, d, e, &f, in, 10, cpu_K[26 + 16 * i]);
sha256_step2_host(f, g, h, &a, b, c, d, &e, in, 11, cpu_K[27 + 16 * i]);
sha256_step2_host(e, f, g, &h, a, b, c, &d, in, 12, cpu_K[28 + 16 * i]);
sha256_step2_host(d, e, f, &g, h, a, b, &c, in, 13, cpu_K[29 + 16 * i]);
sha256_step2_host(c, d, e, &f, g, h, a, &b, in, 14, cpu_K[30 + 16 * i]);
sha256_step2_host(b, c, d, &e, f, g, h, &a, in, 15, cpu_K[31 + 16 * i]);
}

state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
3 changes: 3 additions & 0 deletions algorithm/yescrypt.h
Expand Up @@ -2,9 +2,12 @@
#define YESCRYPT_H

#include "miner.h"
#define YESCRYPT_NAVI_SCRATCHBUF_SIZE (2 * 2048 * 8 * 4 + 2 * 16 * 8 * 4 + 512 * 4 + 8 * 4 + 32) //uchar
#define YESCRYPT_SCRATCHBUF_SIZE (128 * 2048 * 8 ) //uchar
#define YESCRYP_SECBUF_SIZE (128*64*8)
extern int yescrypt_test(unsigned char *pdata, const unsigned char *ptarget, uint32_t nonce);
extern void yescrypt_regenhash(struct work *work);

void sha256_round_body_host(uint32_t* in, uint32_t* state);

#endif /* YESCRYPT_H */
38 changes: 37 additions & 1 deletion driver-opencl.c
Expand Up @@ -1432,7 +1432,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
if (clState->goffset)
p_global_work_offset = (size_t *)&work->blk.nonce;

if (gpu->algorithm.type != ALGO_MTP) {
if (gpu->algorithm.type != ALGO_MTP && gpu->algorithm.type != ALGO_YESCRYPT_NAVI) {
if (gpu->algorithm.type == ALGO_ARGON2D) {
const uint32_t throughput = gpu->throughput;
const size_t global[] = { 16, throughput };
Expand Down Expand Up @@ -1578,6 +1578,40 @@ if (gpu->algorithm.type != ALGO_MTP) {
return -1;
}
}
}
if (gpu->algorithm.type == ALGO_YESCRYPT_NAVI) {
cl_uint threads = globalThreads[0];
size_t worksize = globalThreads[0];

uint32_t tpb = 32U;
size_t grid[1] = {worksize};
size_t grid2[2] = {worksize / tpb * 4U, tpb >> 2};
size_t grid3[2] = {worksize / tpb * 16U, tpb >> 4};
size_t block[1] = {tpb};
size_t block2[2] = {4U, tpb >> 2};
size_t block3[2] = {16U, tpb >> 4};

status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k0, 1, NULL, grid, block, 0, NULL, NULL);

for (uint32_t i = 0; i < 4; i++) {
uint32_t offset = i * (worksize >> 2);
clSetKernelArg(clState->yescrypt_gpu_hash_k1, 2, sizeof(uint32_t), &offset);
status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k1, 2, NULL, grid2, block2, 0, NULL, NULL);

for (uint32_t j = 0; j < 4; j++) {
uint32_t offset1 = j * (worksize >> 4);
uint32_t offset2 = (i * 4 + j) * (worksize >> 4);
clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 3, sizeof(uint32_t), &offset1);
clSetKernelArg(clState->yescrypt_gpu_hash_k2c_r8, 4, sizeof(uint32_t), &offset2);
status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k2c_r8, 2, NULL, grid3, block3, 0, NULL, NULL);

clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 3, sizeof(uint32_t), &offset1);
clSetKernelArg(clState->yescrypt_gpu_hash_k2c1_r8, 4, sizeof(uint32_t), &offset2);
status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k2c1_r8, 2, NULL, grid3, block3, 0, NULL, NULL);
}
}

status=clEnqueueNDRangeKernel(clState->commandQueue, clState->yescrypt_gpu_hash_k5, 1, NULL, grid, block, 0, NULL, NULL);
}
status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
buffersize, thrdata->res, 0, NULL, NULL);
Expand Down Expand Up @@ -1639,6 +1673,8 @@ static void opencl_thread_shutdown(struct thr_info *thr)
clReleaseMemObject(clState->buffer2);
if (clState->buffer3)
clReleaseMemObject(clState->buffer3);
if (clState->buffer4)
clReleaseMemObject(clState->buffer4);
if (clState->MidstateBuf)
clReleaseMemObject(clState->MidstateBuf);
if (clState->MatrixBuf)
Expand Down
2 changes: 1 addition & 1 deletion kernel/yescrypt.cl
Expand Up @@ -41,7 +41,7 @@ __kernel void search(__global const uchar* restrict input, __global uint* restri

__global ulong16 *hashbuffer = (__global ulong16 *)(padcache + (2048 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *prevstate = (__global ulong16 *)(buff1 + (64 * 128 * sizeof(ulong)*(get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (8 * 128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));
__global ulong16 *Bdev = (__global ulong16 *)(buff2 + (128 * sizeof(ulong)* (get_global_id(0) % MAX_GLOBAL_THREADS)));



Expand Down

0 comments on commit 91e924e

Please sign in to comment.