Skip to content

Commit

Permalink
Issue #240
Browse files Browse the repository at this point in the history
Perf gets worse
  • Loading branch information
fancyIX committed Oct 29, 2021
1 parent 12f2de2 commit 7ea5c4c
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 116 deletions.
6 changes: 6 additions & 0 deletions algorithm.c
Original file line number Diff line number Diff line change
Expand Up @@ -749,24 +749,30 @@ static cl_int queue_allium_kernel(struct __clState *clState, struct _dev_blk_ctx
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->buffer1);
CL_SET_ARG(threads);
//CL_SET_ARG(clState->buffer2);
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->buffer1);
CL_SET_ARG(threads);
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->buffer1);
CL_SET_ARG(threads);
// cubehash - search5
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// lyra2_cuda_hash_64 - search6 7 8
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->buffer1);
CL_SET_ARG(threads);
//CL_SET_ARG(clState->buffer2);
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->buffer1);
CL_SET_ARG(threads);
num = 0;
CL_NEXTKERNEL_SET_ARG(clState->padbuffer8);
CL_SET_ARG(clState->buffer1);
CL_SET_ARG(threads);
// skein - search9
CL_NEXTKERNEL_SET_ARG_0(clState->padbuffer8);
// groestl - search10
Expand Down
214 changes: 98 additions & 116 deletions kernel/allium_navi.cl
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,13 @@ barrier(CLK_LOCAL_MEM_FENCE);
/// lyra2 p1

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search2(__global uint* hashes, __global uchar* sharedDataBuf)
__kernel void search2(__global uint* hashes, __global uchar* sharedDataBuf, uint threads)
{
int gid = get_global_id(0);

uint thread = gid - get_global_offset(0);
__global hash2_t *hash = (__global hash2_t *)(hashes + (8* (gid-get_global_offset(0))));
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(0))));
__global ulong4 *lyraState = (__global ulong4 *)(sharedDataBuf);

ulong ttr;

Expand All @@ -258,18 +259,10 @@ __kernel void search2(__global uint* hashes, __global uchar* sharedDataBuf)
roundLyra(state);
}

// state0
lyraState->hl16[0] = state[0];
lyraState->hl16[1] = state[1];
// state1
lyraState->hl16[2] = state[2];
lyraState->hl16[3] = state[3];
// state2
lyraState->hl16[4] = state[4];
lyraState->hl16[5] = state[5];
// state3
lyraState->hl16[6] = state[6];
lyraState->hl16[7] = state[7];
lyraState[threads * 0 + thread] = ((ulong4 *)state)[0];
lyraState[threads * 1 + thread] = ((ulong4 *)state)[1];
lyraState[threads * 2 + thread] = ((ulong4 *)state)[2];
lyraState[threads * 3 + thread] = ((ulong4 *)state)[3];

barrier(CLK_GLOBAL_MEM_FENCE);
}
Expand All @@ -280,10 +273,11 @@ __attribute__((amdgpu_waves_per_eu(1,1)))
__attribute__((amdgpu_num_vgpr(256)))
__attribute__((amdgpu_num_sgpr(200)))
__attribute__((reqd_work_group_size(4, 2, 16)))
__kernel void search3(__global uchar* sharedDataBuf)
__kernel void search3(__global uchar* sharedDataBuf, uint threads)
{
uint gid = get_global_id(2);
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(2))));
uint thread = gid-get_global_offset(2);
__global uint *lyraState = (__global uint *)(sharedDataBuf);

uint notepad[192];

Expand All @@ -306,22 +300,22 @@ __kernel void search3(__global uchar* sharedDataBuf)

//-------------------------------------
// Load Lyra state
if (LOCAL_LINEAR == 0) state[0] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 0) state[1] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 0) state[2] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 0) state[3] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 1) state[0] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 1) state[1] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 1) state[2] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 1) state[3] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 2) state[0] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 2) state[1] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 2) state[2] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 2) state[3] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 3) state[0] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 3) state[1] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 3) state[2] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 3) state[3] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 0) state[0] = ((uint)(lyraState[2 *((0 * threads + thread) * 4 + 0) + player]));
if (LOCAL_LINEAR == 0) state[1] = ((uint)(lyraState[2 *((1 * threads + thread) * 4 + 0) + player]));
if (LOCAL_LINEAR == 0) state[2] = ((uint)(lyraState[2 *((2 * threads + thread) * 4 + 0) + player]));
if (LOCAL_LINEAR == 0) state[3] = ((uint)(lyraState[2 *((3 * threads + thread) * 4 + 0) + player]));
if (LOCAL_LINEAR == 1) state[0] = ((uint)(lyraState[2 *((0 * threads + thread) * 4 + 1) + player]));
if (LOCAL_LINEAR == 1) state[1] = ((uint)(lyraState[2 *((1 * threads + thread) * 4 + 1) + player]));
if (LOCAL_LINEAR == 1) state[2] = ((uint)(lyraState[2 *((2 * threads + thread) * 4 + 1) + player]));
if (LOCAL_LINEAR == 1) state[3] = ((uint)(lyraState[2 *((3 * threads + thread) * 4 + 1) + player]));
if (LOCAL_LINEAR == 2) state[0] = ((uint)(lyraState[2 *((0 * threads + thread) * 4 + 2) + player]));
if (LOCAL_LINEAR == 2) state[1] = ((uint)(lyraState[2 *((1 * threads + thread) * 4 + 2) + player]));
if (LOCAL_LINEAR == 2) state[2] = ((uint)(lyraState[2 *((2 * threads + thread) * 4 + 2) + player]));
if (LOCAL_LINEAR == 2) state[3] = ((uint)(lyraState[2 *((3 * threads + thread) * 4 + 2) + player]));
if (LOCAL_LINEAR == 3) state[0] = ((uint)(lyraState[2 *((0 * threads + thread) * 4 + 3) + player]));
if (LOCAL_LINEAR == 3) state[1] = ((uint)(lyraState[2 *((1 * threads + thread) * 4 + 3) + player]));
if (LOCAL_LINEAR == 3) state[2] = ((uint)(lyraState[2 *((2 * threads + thread) * 4 + 3) + player]));
if (LOCAL_LINEAR == 3) state[3] = ((uint)(lyraState[2 *((3 * threads + thread) * 4 + 3) + player]));

write_state(notepad, state, 0, 7);
round_lyra_4way_sw(state);
Expand Down Expand Up @@ -391,48 +385,45 @@ __kernel void search3(__global uchar* sharedDataBuf)

//-------------------------------------
// save lyra state
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 0) lyraState[2 *((0 * threads + thread) * 4 + 0) + player] = state[0];
if (LOCAL_LINEAR == 0) lyraState[2 *((1 * threads + thread) * 4 + 0) + player] = state[1];
if (LOCAL_LINEAR == 0) lyraState[2 *((2 * threads + thread) * 4 + 0) + player] = state[2];
if (LOCAL_LINEAR == 0) lyraState[2 *((3 * threads + thread) * 4 + 0) + player] = state[3];
if (LOCAL_LINEAR == 1) lyraState[2 *((0 * threads + thread) * 4 + 1) + player] = state[0];
if (LOCAL_LINEAR == 1) lyraState[2 *((1 * threads + thread) * 4 + 1) + player] = state[1];
if (LOCAL_LINEAR == 1) lyraState[2 *((2 * threads + thread) * 4 + 1) + player] = state[2];
if (LOCAL_LINEAR == 1) lyraState[2 *((3 * threads + thread) * 4 + 1) + player] = state[3];
if (LOCAL_LINEAR == 2) lyraState[2 *((0 * threads + thread) * 4 + 2) + player] = state[0];
if (LOCAL_LINEAR == 2) lyraState[2 *((1 * threads + thread) * 4 + 2) + player] = state[1];
if (LOCAL_LINEAR == 2) lyraState[2 *((2 * threads + thread) * 4 + 2) + player] = state[2];
if (LOCAL_LINEAR == 2) lyraState[2 *((3 * threads + thread) * 4 + 2) + player] = state[3];
if (LOCAL_LINEAR == 3) lyraState[2 *((0 * threads + thread) * 4 + 3) + player] = state[0];
if (LOCAL_LINEAR == 3) lyraState[2 *((1 * threads + thread) * 4 + 3) + player] = state[1];
if (LOCAL_LINEAR == 3) lyraState[2 *((2 * threads + thread) * 4 + 3) + player] = state[2];
if (LOCAL_LINEAR == 3) lyraState[2 *((3 * threads + thread) * 4 + 3) + player] = state[3];

barrier(CLK_GLOBAL_MEM_FENCE);
}

// lyra2 p3

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search4(__global uint* hashes, __global uchar* sharedDataBuf)
__kernel void search4(__global uint* hashes, __global uchar* sharedDataBuf, uint threads)
{
int gid = get_global_id(0);

uint thread = gid - get_global_offset(0);
__global hash2_t *hash = (__global hash2_t *)(hashes + (8* (gid-get_global_offset(0))));
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(0))));
__global ulong4 *lyraState = (__global ulong4 *)(sharedDataBuf);

ulong ttr;

ulong2 state[8];
// 1. load lyra State
state[0] = lyraState->hl16[0];
state[1] = lyraState->hl16[1];
state[2] = lyraState->hl16[2];
state[3] = lyraState->hl16[3];
state[4] = lyraState->hl16[4];
state[5] = lyraState->hl16[5];
state[6] = lyraState->hl16[6];
state[7] = lyraState->hl16[7];
((ulong4 *)state)[0] = lyraState[threads * 0 + thread];
((ulong4 *)state)[1] = lyraState[threads * 1 + thread];
((ulong4 *)state)[2] = lyraState[threads * 2 + thread];
((ulong4 *)state)[3] = lyraState[threads * 3 + thread];

// 2. rounds
for (int i = 0; i < 12; ++i)
Expand Down Expand Up @@ -502,12 +493,13 @@ __kernel void search5(__global hash2_t* hashes)
/// lyra2 p1

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search6(__global uint* hashes, __global uchar* sharedDataBuf)
__kernel void search6(__global uint* hashes, __global uchar* sharedDataBuf, uint threads)
{
int gid = get_global_id(0);

uint thread = gid - get_global_offset(0);
__global hash2_t *hash = (__global hash2_t *)(hashes + (8* (gid-get_global_offset(0))));
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(0))));
__global ulong4 *lyraState = (__global ulong4 *)(sharedDataBuf);

ulong ttr;

Expand All @@ -531,18 +523,10 @@ __kernel void search6(__global uint* hashes, __global uchar* sharedDataBuf)
roundLyra(state);
}

// state0
lyraState->hl16[0] = state[0];
lyraState->hl16[1] = state[1];
// state1
lyraState->hl16[2] = state[2];
lyraState->hl16[3] = state[3];
// state2
lyraState->hl16[4] = state[4];
lyraState->hl16[5] = state[5];
// state3
lyraState->hl16[6] = state[6];
lyraState->hl16[7] = state[7];
lyraState[threads * 0 + thread] = ((ulong4 *)state)[0];
lyraState[threads * 1 + thread] = ((ulong4 *)state)[1];
lyraState[threads * 2 + thread] = ((ulong4 *)state)[2];
lyraState[threads * 3 + thread] = ((ulong4 *)state)[3];

barrier(CLK_GLOBAL_MEM_FENCE);
}
Expand All @@ -553,10 +537,11 @@ __attribute__((amdgpu_waves_per_eu(1,1)))
__attribute__((amdgpu_num_vgpr(256)))
__attribute__((amdgpu_num_sgpr(200)))
__attribute__((reqd_work_group_size(4, 2, 16)))
__kernel void search7(__global uchar* sharedDataBuf)
__kernel void search7(__global uchar* sharedDataBuf, uint threads)
{
uint gid = get_global_id(2);
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(2))));
uint thread = gid-get_global_offset(2);
__global uint *lyraState = (__global uint *)(sharedDataBuf);

uint notepad[192];

Expand All @@ -579,22 +564,22 @@ __kernel void search7(__global uchar* sharedDataBuf)

//-------------------------------------
// Load Lyra state
if (LOCAL_LINEAR == 0) state[0] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 0) state[1] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 0) state[2] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 0) state[3] = ((uint)(lyraState->h4[2 * 0 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 1) state[0] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 1) state[1] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 1) state[2] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 1) state[3] = ((uint)(lyraState->h4[2 * 1 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 2) state[0] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 2) state[1] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 2) state[2] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 2) state[3] = ((uint)(lyraState->h4[2 * 2 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 3) state[0] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 0 + player]));
if (LOCAL_LINEAR == 3) state[1] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 1 + player]));
if (LOCAL_LINEAR == 3) state[2] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 2 + player]));
if (LOCAL_LINEAR == 3) state[3] = ((uint)(lyraState->h4[2 * 3 + 2 * 4 * 3 + player]));
if (LOCAL_LINEAR == 0) state[0] = ((uint)(lyraState[2 *((0 * threads + thread) * 4 + 0) + player]));
if (LOCAL_LINEAR == 0) state[1] = ((uint)(lyraState[2 *((1 * threads + thread) * 4 + 0) + player]));
if (LOCAL_LINEAR == 0) state[2] = ((uint)(lyraState[2 *((2 * threads + thread) * 4 + 0) + player]));
if (LOCAL_LINEAR == 0) state[3] = ((uint)(lyraState[2 *((3 * threads + thread) * 4 + 0) + player]));
if (LOCAL_LINEAR == 1) state[0] = ((uint)(lyraState[2 *((0 * threads + thread) * 4 + 1) + player]));
if (LOCAL_LINEAR == 1) state[1] = ((uint)(lyraState[2 *((1 * threads + thread) * 4 + 1) + player]));
if (LOCAL_LINEAR == 1) state[2] = ((uint)(lyraState[2 *((2 * threads + thread) * 4 + 1) + player]));
if (LOCAL_LINEAR == 1) state[3] = ((uint)(lyraState[2 *((3 * threads + thread) * 4 + 1) + player]));
if (LOCAL_LINEAR == 2) state[0] = ((uint)(lyraState[2 *((0 * threads + thread) * 4 + 2) + player]));
if (LOCAL_LINEAR == 2) state[1] = ((uint)(lyraState[2 *((1 * threads + thread) * 4 + 2) + player]));
if (LOCAL_LINEAR == 2) state[2] = ((uint)(lyraState[2 *((2 * threads + thread) * 4 + 2) + player]));
if (LOCAL_LINEAR == 2) state[3] = ((uint)(lyraState[2 *((3 * threads + thread) * 4 + 2) + player]));
if (LOCAL_LINEAR == 3) state[0] = ((uint)(lyraState[2 *((0 * threads + thread) * 4 + 3) + player]));
if (LOCAL_LINEAR == 3) state[1] = ((uint)(lyraState[2 *((1 * threads + thread) * 4 + 3) + player]));
if (LOCAL_LINEAR == 3) state[2] = ((uint)(lyraState[2 *((2 * threads + thread) * 4 + 3) + player]));
if (LOCAL_LINEAR == 3) state[3] = ((uint)(lyraState[2 *((3 * threads + thread) * 4 + 3) + player]));

write_state(notepad, state, 0, 7);
round_lyra_4way_sw(state);
Expand Down Expand Up @@ -664,48 +649,45 @@ __kernel void search7(__global uchar* sharedDataBuf)

//-------------------------------------
// save lyra state
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 0) lyraState->h4[2 * 0 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 1) lyraState->h4[2 * 1 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 2) lyraState->h4[2 * 2 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 0 + player] = state[0];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 1 + player] = state[1];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 2 + player] = state[2];
if (LOCAL_LINEAR == 3) lyraState->h4[2 * 3 + 2 * 4 * 3 + player] = state[3];
if (LOCAL_LINEAR == 0) lyraState[2 *((0 * threads + thread) * 4 + 0) + player] = state[0];
if (LOCAL_LINEAR == 0) lyraState[2 *((1 * threads + thread) * 4 + 0) + player] = state[1];
if (LOCAL_LINEAR == 0) lyraState[2 *((2 * threads + thread) * 4 + 0) + player] = state[2];
if (LOCAL_LINEAR == 0) lyraState[2 *((3 * threads + thread) * 4 + 0) + player] = state[3];
if (LOCAL_LINEAR == 1) lyraState[2 *((0 * threads + thread) * 4 + 1) + player] = state[0];
if (LOCAL_LINEAR == 1) lyraState[2 *((1 * threads + thread) * 4 + 1) + player] = state[1];
if (LOCAL_LINEAR == 1) lyraState[2 *((2 * threads + thread) * 4 + 1) + player] = state[2];
if (LOCAL_LINEAR == 1) lyraState[2 *((3 * threads + thread) * 4 + 1) + player] = state[3];
if (LOCAL_LINEAR == 2) lyraState[2 *((0 * threads + thread) * 4 + 2) + player] = state[0];
if (LOCAL_LINEAR == 2) lyraState[2 *((1 * threads + thread) * 4 + 2) + player] = state[1];
if (LOCAL_LINEAR == 2) lyraState[2 *((2 * threads + thread) * 4 + 2) + player] = state[2];
if (LOCAL_LINEAR == 2) lyraState[2 *((3 * threads + thread) * 4 + 2) + player] = state[3];
if (LOCAL_LINEAR == 3) lyraState[2 *((0 * threads + thread) * 4 + 3) + player] = state[0];
if (LOCAL_LINEAR == 3) lyraState[2 *((1 * threads + thread) * 4 + 3) + player] = state[1];
if (LOCAL_LINEAR == 3) lyraState[2 *((2 * threads + thread) * 4 + 3) + player] = state[2];
if (LOCAL_LINEAR == 3) lyraState[2 *((3 * threads + thread) * 4 + 3) + player] = state[3];

barrier(CLK_GLOBAL_MEM_FENCE);
}

// lyra2 p3

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search8(__global uint* hashes, __global uchar* sharedDataBuf)
__kernel void search8(__global uint* hashes, __global uchar* sharedDataBuf, uint threads)
{
int gid = get_global_id(0);

uint thread = gid - get_global_offset(0);
__global hash2_t *hash = (__global hash2_t *)(hashes + (8* (gid-get_global_offset(0))));
__global lyraState_t *lyraState = (__global lyraState_t *)(sharedDataBuf + ((8 * 4 * 4) * (gid-get_global_offset(0))));
__global ulong4 *lyraState = (__global ulong4 *)(sharedDataBuf);

ulong ttr;

ulong2 state[8];
// 1. load lyra State
state[0] = lyraState->hl16[0];
state[1] = lyraState->hl16[1];
state[2] = lyraState->hl16[2];
state[3] = lyraState->hl16[3];
state[4] = lyraState->hl16[4];
state[5] = lyraState->hl16[5];
state[6] = lyraState->hl16[6];
state[7] = lyraState->hl16[7];
((ulong4 *)state)[0] = lyraState[threads * 0 + thread];
((ulong4 *)state)[1] = lyraState[threads * 1 + thread];
((ulong4 *)state)[2] = lyraState[threads * 2 + thread];
((ulong4 *)state)[3] = lyraState[threads * 3 + thread];

// 2. rounds
for (int i = 0; i < 12; ++i)
Expand Down

0 comments on commit 7ea5c4c

Please sign in to comment.