split the scratchpad buffer into two buffer

This pull request will be allow to use more than 2k threads within one gpu thread. With this PR there should be no need to spawn to cpu threads per vega gpu.
fireice-uk · Feb 12, 2018 · 675134c · 675134c
1 parent 29bac54
commit 675134c
Show file tree

Hide file tree

Showing 3 changed files with 91 additions and 40 deletions.
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -269,13 +269,28 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	}
 
 	size_t g_thd = ctx->rawIntensity;
-	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret);
-	if(ret != CL_SUCCESS)
+	size_t halfIntensity = (((g_thd / 2u) + ctx->workSize - 1u) / ctx->workSize) * ctx->workSize;
+	size_t i = 0;
+	for(; i < g_thd / halfIntensity; ++i)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
-		return ERR_OCL_API;
+		ctx->Scratch[i] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * halfIntensity, NULL, &ret);
+		if(ret != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
+			return ERR_OCL_API;
+		}
+	}
+	if((g_thd / halfIntensity) * size_t(halfIntensity) < g_thd)
+	{
+		// this code is only called if the second memory is not created by the memory init loop
+		size_t leftOverThd = g_thd - (g_thd / halfIntensity) * halfIntensity;
+		ctx->Scratch[i] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * leftOverThd, NULL, &ret);
+		if(ret != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads left over buffer.", err_to_str(ret));
+			return ERR_OCL_API;
+		}
 	}
-
 	ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
@@ -730,21 +745,26 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	}
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
+		return ERR_OCL_API;
+	}
+	if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -753,71 +773,81 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	// CN2 Kernel
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
+		return ERR_OCL_API;
+	}
+	if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
 		return(ERR_OCL_API);
 	}
 
 	// CN3 Kernel
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
+		return ERR_OCL_API;
+	}
+	if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 0
-	if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 1
-	if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 2
-	if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 3
-	if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
 		return(ERR_OCL_API);

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -17,7 +17,6 @@
 #define ERR_STUPID_PARAMS (1)
 
 
-
 struct GpuContext
 {
 	/*Input vars*/
@@ -32,6 +31,7 @@ struct GpuContext
 	cl_mem InputBuffer;
 	cl_mem OutputBuffer;
 	cl_mem ExtraBuffers[6];
+	cl_mem Scratch[2];
 	cl_program Program;
 	cl_kernel Kernels[7];
 	size_t freeMem;

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -414,11 +414,11 @@ void AESExpandKey256(uint *keybuf)
 #if(STRIDED_INDEX==0)
 #   define IDX(x)	(x)
 #else
-#   define IDX(x)	((x) * (Threads))
+#   define IDX(x)	((x) * (Threads2))
 #endif
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
+__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, ulong Threads)
 {
 	ulong State[25];
 	uint ExpandedKey1[256];
@@ -440,17 +440,24 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-	// do not use early return here
-	if(gIdx < Threads)
-	{
-		states += 25 * gIdx;
+	size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;
+
+	const ulong memSlot =  gIdx / halfIntensity;
+    __global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
+
+	ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);
 
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
 #else
-		Scratchpad += gIdx;
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
 #endif
 
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		states += 25 * gIdx;
+
 		((ulong8 *)State)[0] = vload8(0, input);
 		State[8] = input[8];
 		State[9] = input[9];
@@ -504,7 +511,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads)
+__kernel void cn1(__global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, ulong Threads)
 {
 	ulong a[2], b[2];
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
@@ -523,16 +530,23 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	uint4 b_x;
-
+
+	size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;
+
+	const ulong memSlot =  gIdx / halfIntensity;
+    __global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
+	ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);
+
+#if(STRIDED_INDEX==0)
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
+#else
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
+#endif
+
 	// do not use early return here
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
-#if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
-		Scratchpad += gIdx;
-#endif
 
 		a[0] = states[0] ^ states[4];
 		b[0] = states[2] ^ states[6];
@@ -575,7 +589,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
+__kernel void cn2(__global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
 {
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 	uint ExpandedKey2[256];
@@ -597,15 +611,22 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
+	size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;
+
+	const ulong memSlot =  gIdx / halfIntensity;
+    __global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
+	ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);
+
+#if(STRIDED_INDEX==0)
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
+#else
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
+#endif
+
 	// do not use early return here
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
-#if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
-		Scratchpad += gIdx;
-#endif
 
 		#if defined(__Tahiti__) || defined(__Pitcairn__)