fireice-uk · psychocrypt · Feb 12, 2018
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -269,13 +269,28 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	}
 
 	size_t g_thd = ctx->rawIntensity;
-	ctx->ExtraBuffers[0] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * g_thd, NULL, &ret);
-	if(ret != CL_SUCCESS)
+	size_t halfIntensity = (((g_thd / 2u) + ctx->workSize - 1u) / ctx->workSize) * ctx->workSize;
+	size_t i = 0;
+	for(; i < g_thd / halfIntensity; ++i)
 	{
-		printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
-		return ERR_OCL_API;
+		ctx->Scratch[i] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * halfIntensity, NULL, &ret);
+		if(ret != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads buffer.", err_to_str(ret));
+			return ERR_OCL_API;
+		}
+	}
+	if((g_thd / halfIntensity) * size_t(halfIntensity) < g_thd)
+	{
+		// this code is only called if the second memory is not created by the memory init loop
+		size_t leftOverThd = g_thd - (g_thd / halfIntensity) * halfIntensity;
+		ctx->Scratch[i] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, hashMemSize * leftOverThd, NULL, &ret);
+		if(ret != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create hash scratchpads left over buffer.", err_to_str(ret));
+			return ERR_OCL_API;
+		}
 	}
-
 	ctx->ExtraBuffers[1] = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, 200 * g_thd, NULL, &ret);
 	if(ret != CL_SUCCESS)
 	{
@@ -730,21 +745,26 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	}
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[0], 1, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
+		return ERR_OCL_API;
+	}
+	if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[0], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[0], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[0], 4, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 0, argument 3.", err_to_str(ret));
 		return(ERR_OCL_API);
@@ -753,71 +773,81 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 	// CN2 Kernel
 
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[1], 0, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
+		return ERR_OCL_API;
+	}
+	if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[1], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[1], 2, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[1], 3, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 1, argument 2.", err_to_str(ret));
 		return(ERR_OCL_API);
 	}
 
 	// CN3 Kernel
 	// Scratchpads
-	if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->ExtraBuffers + 0)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 0, sizeof(cl_mem), ctx->Scratch + 0)) != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
+		return ERR_OCL_API;
+	}
+	if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->Scratch + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 0.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// States
-	if((ret = clSetKernelArg(ctx->Kernels[2], 1, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 1)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 1.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 0
-	if((ret = clSetKernelArg(ctx->Kernels[2], 2, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 2)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 2.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 1
-	if((ret = clSetKernelArg(ctx->Kernels[2], 3, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 3)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 3.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 2
-	if((ret = clSetKernelArg(ctx->Kernels[2], 4, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 4)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 4.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Branch 3
-	if((ret = clSetKernelArg(ctx->Kernels[2], 5, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_mem), ctx->ExtraBuffers + 5)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 5.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
 	// Threads
-	if((ret = clSetKernelArg(ctx->Kernels[2], 6, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
+	if((ret = clSetKernelArg(ctx->Kernels[2], 7, sizeof(cl_ulong), &numThreads)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel 2, argument 6.", err_to_str(ret));
 		return(ERR_OCL_API);

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp
@@ -17,7 +17,6 @@
 #define ERR_STUPID_PARAMS (1)
 
 
-
 struct GpuContext
 {
 	/*Input vars*/
@@ -32,6 +31,7 @@ struct GpuContext
 	cl_mem InputBuffer;
 	cl_mem OutputBuffer;
 	cl_mem ExtraBuffers[6];
+	cl_mem Scratch[2];
 	cl_program Program;
 	cl_kernel Kernels[7];
 	size_t freeMem;

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -414,11 +414,11 @@ void AESExpandKey256(uint *keybuf)
 #if(STRIDED_INDEX==0)
 #   define IDX(x)	(x)
 #else
-#   define IDX(x)	((x) * (Threads))
+#   define IDX(x)	((x) * (Threads2))
 #endif
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ulong *states, ulong Threads)
+__kernel void cn0(__global ulong *input, __global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, ulong Threads)
 {
 	ulong State[25];
 	uint ExpandedKey1[256];
@@ -440,17 +440,24 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
-	// do not use early return here
-	if(gIdx < Threads)
-	{
-		states += 25 * gIdx;
+	size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;
+
+	const ulong memSlot =  gIdx / halfIntensity;
+    __global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
+
+	ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);
 
 #if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
 #else
-		Scratchpad += gIdx;
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
 #endif
 
+	// do not use early return here
+	if(gIdx < Threads)
+	{
+		states += 25 * gIdx;
+
 		((ulong8 *)State)[0] = vload8(0, input);
 		State[8] = input[8];
 		State[9] = input[9];
@@ -504,7 +511,7 @@ __kernel void cn0(__global ulong *input, __global uint4 *Scratchpad, __global ul
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Threads)
+__kernel void cn1(__global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, ulong Threads)
 {
 	ulong a[2], b[2];
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
@@ -523,16 +530,23 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 	barrier(CLK_LOCAL_MEM_FENCE);
 
 	uint4 b_x;
-
+
+	size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;
+
+	const ulong memSlot =  gIdx / halfIntensity;
+    __global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
+	ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);
+
+#if(STRIDED_INDEX==0)
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
+#else
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
+#endif
+
 	// do not use early return here
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
-#if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
-		Scratchpad += gIdx;
-#endif
 
 		a[0] = states[0] ^ states[4];
 		b[0] = states[2] ^ states[6];
@@ -575,7 +589,7 @@ __kernel void cn1(__global uint4 *Scratchpad, __global ulong *states, ulong Thre
 }
 
 __attribute__((reqd_work_group_size(WORKSIZE, 8, 1)))
-__kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
+__kernel void cn2(__global uint4 *Scratchpad0,__global uint4 *Scratchpad1, __global ulong *states, __global uint *Branch0, __global uint *Branch1, __global uint *Branch2, __global uint *Branch3, ulong Threads)
 {
 	__local uint AES0[256], AES1[256], AES2[256], AES3[256];
 	uint ExpandedKey2[256];
@@ -597,15 +611,22 @@ __kernel void cn2(__global uint4 *Scratchpad, __global ulong *states, __global u
 
 	barrier(CLK_LOCAL_MEM_FENCE);
 
+	size_t halfIntensity = (((Threads / 2u) + WORKSIZE - 1u) / WORKSIZE) * WORKSIZE;
+
+	const ulong memSlot =  gIdx / halfIntensity;
+    __global uint4 *ScratchpadTmp = memSlot == 0 ? Scratchpad0 : Scratchpad1;
+	ulong Threads2 = memSlot == 0 ? halfIntensity : (Threads - halfIntensity);
+
+#if(STRIDED_INDEX==0)
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity) * (ITERATIONS >> 2);
+#else
+	__global uint4 *Scratchpad = ScratchpadTmp + (gIdx % halfIntensity);
+#endif
+
 	// do not use early return here
 	if(gIdx < Threads)
 	{
 		states += 25 * gIdx;
-#if(STRIDED_INDEX==0)
-		Scratchpad += gIdx * (ITERATIONS >> 2);
-#else
-		Scratchpad += gIdx;
-#endif
 
 		#if defined(__Tahiti__) || defined(__Pitcairn__)