Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] allow one thread for vega gpus #624

Open
wants to merge 1 commit into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 48 additions & 18 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,13 +269,28 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
}

size_t g_thd = ctx->rawIntensity;
ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret);
if(ret != CL_SUCCESS)
size_t halfIntensity = (((g_thd / 2u) + ctx->workSize - 1u) / ctx->workSize) * ctx->workSize;
size_t i = 0;
for(; i < g_thd / halfIntensity; ++i)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
return ERR_OCL_API;
ctx->Scratch[i] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * halfIntensity, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
return ERR_OCL_API;
}
}
if((g_thd / halfIntensity) * size_t(halfIntensity) < g_thd)
{
// this code is only called if the second memory is not created by the memory init loop
size_t leftOverThd = g_thd - (g_thd / halfIntensity) * halfIntensity;
ctx->Scratch[i] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * leftOverThd, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads left over buffer.", err_to_str(ret));
return ERR_OCL_API;
}
}

ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret);
if(ret != CL_SUCCESS)
{
Expand Down Expand Up @@ -730,21 +745,26 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
}

// Scratchpads
if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}
if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}

// States
if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
return ERR_OCL_API;
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
return(ERR_OCL_API);
Expand All @@ -753,71 +773,81 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
// CN2 Kernel

// Scratchpads
if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}
if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}

// States
if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
return(ERR_OCL_API);
}

// CN3 Kernel
// Scratchpads
if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}
if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
return ERR_OCL_API;
}

// States
if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
return ERR_OCL_API;
}

// Branch 0
if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
return ERR_OCL_API;
}

// Branch 1
if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
return ERR_OCL_API;
}

// Branch 2
if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
return ERR_OCL_API;
}

// Branch 3
if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
return ERR_OCL_API;
}

// Threads
if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
if((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
return(ERR_OCL_API);
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/amd_gpu/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
#define ERR_STUPID_PARAMS (1)



struct GpuContext
{
/*Input vars*/
Expand All @@ -32,6 +31,7 @@ struct GpuContext
cl_mem InputBuffer;
cl_mem OutputBuffer;
cl_mem ExtraBuffers[6];
cl_mem Scratch[2];
cl_program Program;
cl_kernel Kernels[7];
size_t freeMem;
Expand Down
63 changes: 42 additions & 21 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -414,11 +414,11 @@ void AESExpandKey256(uint *keybuf)
#if(STRIDED_INDEX==0)
# define IDX(x) (x)
#else
# define IDX(x) ((x) * (Threads))
# define IDX(x) ((x) * (Threads2))
#endif

__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, ulong Threads)
{
ulong State[25];
uint ExpandedKey1[256];
Expand All @@ -440,17 +440,24 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul

barrier(CLK_LOCAL_MEM_FENCE);

// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;
size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;

const ulong memSlot = gIdx / halfIntensity;
__global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;

ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);

#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
#else
Scratchpad += gIdx;
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
#endif

// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;

((ulong8 *)State)[0] = vload8(0, input);
State[8] = input[8];
State[9] = input[9];
Expand Down Expand Up @@ -504,7 +511,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
}

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads)
__kernel void cn1(__global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, ulong Threads)
{
ulong a[2], b[2];
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
Expand All @@ -523,16 +530,23 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
barrier(CLK_LOCAL_MEM_FENCE);

uint4 b_x;


size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;

const ulong memSlot = gIdx / halfIntensity;
__global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);

#if(STRIDED_INDEX==0)
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
#else
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
#endif

// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
#else
Scratchpad += gIdx;
#endif

a[0] = states[0] ^ states[4];
b[0] = states[2] ^ states[6];
Expand Down Expand Up @@ -575,7 +589,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
}

__attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
__kernel void cn2(__global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
{
__local uint AES0[256], AES1[256], AES2[256], AES3[256];
uint ExpandedKey2[256];
Expand All @@ -597,15 +611,22 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u

barrier(CLK_LOCAL_MEM_FENCE);

size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;

const ulong memSlot = gIdx / halfIntensity;
__global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);

#if(STRIDED_INDEX==0)
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
#else
__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
#endif

// do not use early return here
if(gIdx < Threads)
{
states += 25 * gIdx;
#if(STRIDED_INDEX==0)
Scratchpad += gIdx * (ITERATIONS >> 2);
#else
Scratchpad += gIdx;
#endif

#if defined(__Tahiti__) || defined(__Pitcairn__)

Expand Down