Skip to content

Commit

Permalink
split the scratchpad buffer into two buffer
Browse files Browse the repository at this point in the history
This pull request will be allow to use more than 2k threads within one gpu thread.
With this PR there should be no need to spawn to cpu threads per vega gpu.
  • Loading branch information
psychocrypt committed Feb 12, 2018
1 parent 29bac54 commit 675134c
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 40 deletions.
66 changes: 48 additions & 18 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,13 +269,28 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
}

size_t g_thd = ctx->rawIntensity;
ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret);
if(ret != CL_SUCCESS)
size_t halfIntensity = (((g_thd / 2u) + ctx->workSize - 1u) / ctx->workSize) * ctx->workSize;
size_t i = 0;
for(; i < g_thd / halfIntensity; ++i)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
return ERR_OCL_API;
ctx->Scratch[i] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * halfIntensity, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
return ERR_OCL_API;
}
}
if((g_thd / halfIntensity) * size_t(halfIntensity) < g_thd)
{
// this code is only called if the second memory is not created by the memory init loop
size_t leftOverThd = g_thd - (g_thd / halfIntensity) * halfIntensity;
ctx->Scratch[i] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * leftOverThd, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads left over buffer.", err_to_str(ret));
return ERR_OCL_API;
}
}

ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret);
if(ret != CL_SUCCESS)
{
Expand Down Expand Up @@ -730,21 +745,26 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
}

// Scratchpads
if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}
if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}

// States
if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
return ERR_OCL_API;
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
return(ERR_OCL_API);
Expand All @@ -753,71 +773,81 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
// CN2 Kernel

// Scratchpads
if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}
if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}

// States
if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
return(ERR_OCL_API);
}

// CN3 Kernel
// Scratchpads
if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}
if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}

// States
if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}

// Branch 0
if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
return ERR_OCL_API;
}

// Branch 1
if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
return ERR_OCL_API;
}

// Branch 2
if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
return ERR_OCL_API;
}

// Branch 3
if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
return ERR_OCL_API;
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
return(ERR_OCL_API);
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/amd_gpu/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#define ERR_STUPID_PARAMS (1)



struct GpuContext
{
/*Input vars*/
Expand All @@ -32,6 +31,7 @@ struct GpuContext
cl_mem InputBuffer;
cl_mem OutputBuffer;
cl_mem ExtraBuffers[6];
cl_mem Scratch[2];
cl_program Program;
cl_kernel Kernels[7];
size_t freeMem;
Expand Down
63 changes: 42 additions & 21 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -414,11 +414,11 @@ void AESExpandKey256(uint *keybuf)
#if(STRIDED_INDEX==0)
# define IDX(x) (x)
#else
# define IDX(x) ((x) * (Threads))
# define IDX(x) ((x) * (Threads2))
#endif

__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, ulong Threads)
{
ulong State[25];
uint ExpandedKey1[256];
Expand All @@ -440,17 +440,24 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul

barrier(CLK_LOCAL_MEM_FENCE);

// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;
size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;

const ulong memSlot = gIdx / halfIntensity;
__global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;

ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);

#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
#else
Scratchpad += gIdx;
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
#endif

// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;

((ulong8 *)State)[0] = vload8(0, input);
State[8] = input[8];
State[9] = input[9];
Expand Down Expand Up @@ -504,7 +511,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads)
__kernel void cn1(__global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, ulong Threads)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
Expand All @@ -523,16 +530,23 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
barrier(CLK_LOCAL_MEM_FENCE);

uint4 b_x;


size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;

const ulong memSlot = gIdx / halfIntensity;
__global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);

#if(STRIDED_INDEX==0)
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
#else
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
#endif

// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
#else
Scratchpad += gIdx;
#endif

a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
Expand Down Expand Up @@ -575,7 +589,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
}

__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
__kernel void cn2(__global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
{
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
uint ExpandedKey2[256];
Expand All @@ -597,15 +611,22 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u

barrier(CLK_LOCAL_MEM_FENCE);

size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;

const ulong memSlot = gIdx / halfIntensity;
__global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);

#if(STRIDED_INDEX==0)
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
#else
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
#endif

// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
#else
Scratchpad += gIdx;
#endif

#if defined(__Tahiti__) || defined(__Pitcairn__)

Expand Down

0 comments on commit 675134c

Please sign in to comment.