diff --git a/algorithm.c b/algorithm.c index ec866b7..46b0c2d 100644 --- a/algorithm.c +++ b/algorithm.c @@ -1246,6 +1246,7 @@ static cl_int queue_mtp_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unus else buffer->StartNonce = 0; + //////////////////////////////////////////////// if (buffer->prev_job_id!=NULL) { @@ -1423,24 +1424,17 @@ static cl_int queue_mtp_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unus if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d while creating mtp_fc kernel", status); } - -// printf("Step 1 : Compute F(I) and store its T blocks X[1], X[2], ..., X[T] in the memory \n"); size_t mtp_tree_size = 2 * 1048576 * 4 * sizeof(uint64_t); clEnqueueReadBuffer(clState->commandQueue, buffer->tree, CL_TRUE, 0, mtp_tree_size, mtp->dx, 0, NULL, NULL); -// printf("Step 2 : Compute the root Φ of the Merkle hash tree \n"); + mtp->ordered_tree = new MerkleTree(mtp->dx, true); -// JobId[thr_id] = work->data[17]; -// XtraNonce2[thr_id] = ((uint64_t*)work->xnonce2)[0]; blk->work->prev_job_id = blk->work->job_id; pool->swork.prev_job_id = pool->swork.job_id; buffer->prev_job_id = pool->swork.job_id; MerkleTree::Buffer root = mtp->ordered_tree->getRoot(); std::copy(root.begin(), root.end(), mtp->TheMerkleRoot); -// mtp_setBlockTarget(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id]); -// printf("merkleroot %08x %08x %08x %08x \n", ((uint32_t*)mtp->TheMerkleRoot)[0], ((uint32_t*)mtp->TheMerkleRoot)[1], ((uint32_t*)mtp->TheMerkleRoot)[2], ((uint32_t*)mtp->TheMerkleRoot)[3]); -// clFinish(clState->commandQueue); root.resize(0); } @@ -1463,7 +1457,7 @@ static cl_int queue_mtp_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unus uint32_t rawint = 2 << (blk->work->thr->cgpu->intensity - 1); kernel = &clState->mtp_yloop; size_t Global2 = rawint ; //1048576; //65536; - size_t Local2 = 256; + size_t Local2 = 64; size_t buffersize = 1024; num = 0; CL_SET_ARG(clState->CLbuffer0); @@ -1479,7 +1473,7 @@ static cl_int queue_mtp_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unus applog(LOG_ERR, "Error %d with kernel mtp_yloop.", status); } - status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, buffersize, Solution, 0, NULL, NULL); + status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, buffersize, Solution, 0, NULL, NULL); buffer->StartNonce += rawint; if (Solution[0xff]) { uint256 TheUint256Target[1]; @@ -1503,9 +1497,12 @@ static cl_int queue_mtp_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unus blk->work->mtpPOW.TheNonce = Solution[0]; ((uint32_t*)blk->work->data)[19] = Solution[0]; // printf("*************************************************************************************Found a solution\n"); - } else - printf("*************************************************************************************Not a solution\n"); - + } +else { + Solution[0xff]=0; + status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, buffersize, Solution, 0, NULL, NULL); + printf("*************************************************************************************Not a solution\n"); +} } //printf("after mtp_yloop\n"); // if (status != CL_SUCCESS) diff --git a/driver-opencl.c b/driver-opencl.c index 6a48bc3..9b285cb 100644 --- a/driver-opencl.c +++ b/driver-opencl.c @@ -1346,7 +1346,7 @@ applog(LOG_DEBUG, "****************coming to opencl_thread_init **************** applog(LOG_ERR, "Failed to calloc in opencl_thread_init"); return false; } -if (clState!=NULL) + status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, BUFFERSIZE, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { diff --git a/kernel/mtp.cl b/kernel/mtp.cl index 7d7c593..d59dbb1 100644 --- a/kernel/mtp.cl +++ b/kernel/mtp.cl @@ -608,11 +608,14 @@ static unsigned warp_id() return ret; } #endif +#define LEN 8 +#define DIV 256 +//#define FARLOAD(x) far[warp][(x) + lane*(LEN+SHR_OFF)] +#define FARSTORE(x) far[warp][lane + (x)*(LEN+SHR_OFF)] +#define FARLOAD(x) FarReg[(x)] -#define FARLOAD(x) far[warp][(x)*(8+SHR_OFF) + lane] -#define FARSTORE(x) far[warp][lane*(8+SHR_OFF) + (x)] -#define SHR_OFF 1 -#define TPB_MTP 256 +#define SHR_OFF 0 +#define TPB_MTP 64 __attribute__((reqd_work_group_size(TPB_MTP, 1, 1))) __kernel void mtp_yloop(__global unsigned int* pData, __global const uint4 * __restrict__ DBlock, __global const uint4 * __restrict__ DBlock2, @@ -625,10 +628,11 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint uint32_t event_thread = get_global_id(0) - get_global_offset(0); //thread / ThreadNumber; uint32_t NonceIterator = get_global_id(0); - int lane = get_local_id(0) % 8; - int warp = get_local_id(0) / 8;;//warp_id(); - __local ulong2 far[TPB_MTP / 8][8 * (8 + SHR_OFF)]; - __local uint32_t farIndex[TPB_MTP / 8][8]; + int lane = get_local_id(0) % DIV; + int warp = get_local_id(0) / DIV;;//warp_id(); +// __local ulong2 far[TPB_MTP/ DIV][256 * (LEN + SHR_OFF)]; + ulong2 FarReg[8]; + uint32_t farIndex; const uint32_t half_memcost = 2 * 1024 * 1024; const uint64_t lblakeFinal[8] = { @@ -647,18 +651,9 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint uint8 YLocalPrint; ulong8 DataChunk[2] = { 0 }; - /* - ((uint4*)DataChunk)[0] = ((uint4*)pData)[0]; - ((uint4*)DataChunk)[1] = ((uint4*)pData)[1]; - - ((uint4*)DataChunk)[2] = ((uint4*)pData)[2]; - ((uint4*)DataChunk)[3] = ((uint4*)pData)[3]; - */ - + ((uint8 *)DataChunk)[0] = ((__global uint8 *)pData)[0]; ((uint8 *)DataChunk)[1] = ((__global uint8 *)pData)[1]; - //((uint2x4 *)DataChunk)[0] = __ldg4(&((uint2x4 *)pData)[0]); - //((uint2x4 *)DataChunk)[1] = __ldg4(&((uint2x4 *)pData)[1]); ((uint4*)DataChunk)[4] = ((__global uint4*)pData)[4]; ((uint4*)DataChunk)[5] = ((__global uint4*)Elements)[0]; @@ -690,7 +685,7 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint FARLOAD(t + 6) = D[t]; } - farIndex[warp][lane] = YLocal.s0 & 0x3FFFFF; + farIndex = YLocal.s0 & 0x3FFFFF; barrier(CLK_LOCAL_MEM_FENCE); ulong8 DataChunk[2]; @@ -710,7 +705,7 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint ulong2 *D = (ulong2*)&YLocal; D[t] = FARLOAD(t + 6); } - + barrier(CLK_LOCAL_MEM_FENCE); len += last ? 32 : 128; @@ -721,10 +716,11 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint #pragma unroll for (int t = 0; t<8; t++) { - __global ulong2 *farP = (farIndex[warp][t]