Skip to content

Commit

Permalink
fix linux compilation (untested) kernel optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
djm34 committed Jan 27, 2019
1 parent 9a64b57 commit b7b07c8
Show file tree
Hide file tree
Showing 13 changed files with 2,455 additions and 107 deletions.
6 changes: 6 additions & 0 deletions Makefile.am
Expand Up @@ -87,5 +87,11 @@ sgminer_SOURCES += algorithm/lyra2Z.c algorithm/lyra2Z.h
sgminer_SOURCES += algorithm/lyra2h.c algorithm/lyra2h.h
sgminer_SOURCES += gbt-util.c gbt-util.h

sgminer_SOURCES += algorithm/argon2d/argon2ref/blake2/blake2b.c algorithm/argon2d/argon2ref/argon2.c algorithm/argon2d/argon2ref/core.c algorithm/argon2d/argon2ref/opt.c algorithm/argon2d/argon2ref/thread.c algorithm/argon2d/argon2ref/encoding.c algorithm/argon2d/argon2ref/argon2.h
sgminer_SOURCES += algorithm/argon2d/argon2d.c algorithm/argon2d/argon2d.h
sgminer_SOURCES += mtp_argon2ref/mtp_argon2.c mtp_argon2ref/mtp_blake2ba.c mtp_argon2ref/mtp_core.c mtp_argon2ref/mtp_encoding.c mtp_argon2ref/mtp_ref.c mtp_argon2ref/mtp_thread.c mtp_argon2ref/mtp_argon2.h mtp_argon2ref/mtp_blake2-impl.h mtp_argon2ref/mtp_blake2.h mtp_argon2ref/mtp_blake2b-load-sse2.h mtp_argon2ref/mtp_blake2b-load-sse41.h mtp_argon2ref/mtp_blake2b-round.h mtp_argon2ref/mtp_blamka-round-opt.h mtp_argon2ref/mtp_blamka-round-ref.h mtp_argon2ref/mtp_core.h mtp_argon2ref/mtp_encoding.h mtp_argon2ref/mtp_thread.h
sgminer_SOURCES += algorithm/mtp_algo.c algorithm/mtp_algo.h
sgminer_SOURCES += merkletree/merkle-tree.cpp merkletree/mtp.cpp merkletree/merkle-tree.hpp merkletree/mtp.h

bin_SCRIPTS = $(top_srcdir)/kernel/*.cl

2 changes: 2 additions & 0 deletions algorithm.c
Expand Up @@ -55,6 +55,7 @@

const char *algorithm_type_str[] = {
"mtp",
"mtp_vega",
"Unknown",
"Credits",
"Scrypt",
Expand Down Expand Up @@ -1708,6 +1709,7 @@ static algorithm_settings_t algos[] = {
{ "lyra2Z" , ALGO_LYRA2Z , "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 1, 0,0, lyra2Z_regenhash , precalc_hash_blake256, queue_lyra2z_kernel , gen_hash, NULL },
{ "lyra2h" , ALGO_LYRA2H , "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 1, 0,0, lyra2h_regenhash , precalc_hash_blake256, queue_lyra2h_kernel , gen_hash, NULL },
{ "mtp" , ALGO_MTP , "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 1, 0,0, mtp_regenhash , NULL, queue_mtp_kernel , gen_hash, NULL },
{ "mtp_vega" , ALGO_MTP , "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 1, 0,0, mtp_regenhash , NULL, queue_mtp_kernel , gen_hash, NULL },

// kernels starting from this will have difficulty calculated by using fuguecoin algorithm
#define A_FUGUE(a, b, c) \
Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Expand Up @@ -255,7 +255,7 @@ AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
AM_CONDITIONAL([HAVE_x86_64], [test x$have_x86_64 = xtrue])

AC_CONFIG_SUBDIRS([submodules/jansson])
JANSSON_LIBS="submodules/jansson/src/.libs/libjansson.a"
JANSSON_LIBS="submodules/jansson/src/.libs/libbosjansson.a"

PKG_PROG_PKG_CONFIG()

Expand Down
6 changes: 3 additions & 3 deletions driver-opencl.c
Expand Up @@ -1324,7 +1324,7 @@ static bool opencl_thread_prepare(struct thr_info *thr)

static bool opencl_thread_init(struct thr_info *thr)
{
applog(LOG_DEBUG, "****************coming to opencl_thread_init *******************");

const int thr_id = thr->id;
struct cgpu_info *gpu = thr->cgpu;
struct opencl_thread_data *thrdata;
Expand All @@ -1346,7 +1346,7 @@ applog(LOG_DEBUG, "****************coming to opencl_thread_init ****************
applog(LOG_ERR, "Failed to calloc in opencl_thread_init");
return false;
}

if (clState != NULL)
status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
BUFFERSIZE, blank_res, 0, NULL, NULL);
if (unlikely(status != CL_SUCCESS)) {
Expand All @@ -1359,7 +1359,7 @@ applog(LOG_DEBUG, "****************coming to opencl_thread_init ****************
gpu->status = LIFE_WELL;

gpu->device_last_well = time(NULL);
applog(LOG_DEBUG, "****************leave opencl_thread_init *******************");

return true;
}

Expand Down
77 changes: 27 additions & 50 deletions kernel/mtp.cl
Expand Up @@ -3,8 +3,18 @@
* MTP
* djm34 2017-2018
* krnlx 2018
* djm34 2019
**/

#define NVIDIA_GPU 0
#ifdef cl_nv_pragma_unroll
#define NVIDIA
#undef NVIDIA_GPU
#define NVIDIA_GPU 1
#endif


#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
typedef unsigned long uint64_t;
typedef uint uint32_t;
//typedef unsigned char uint8_t;
Expand Down Expand Up @@ -115,37 +125,19 @@ __constant static const uchar blake2b_sigma[12][16] =
#define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))

/*__device__ __forceinline__*/
static inline uint64_t ROTR64X(const uint64_t value, const int offset) {
// return rotate(value, (ulong)(64 - offset));
return SPH_ROTR64(value, offset);
/*
uint2 result;
const uint2 tmp = vectorize(value);

if (offset == 8) {
result.x = __byte_perm(tmp.x, tmp.y, 0x4321);
result.y = __byte_perm(tmp.y, tmp.x, 0x4321);
}
else if (offset == 16) {
result.x = __byte_perm(tmp.x, tmp.y, 0x5432);
result.y = __byte_perm(tmp.y, tmp.x, 0x5432);
}
else if (offset == 24) {
result.x = __byte_perm(tmp.x, tmp.y, 0x6543);
result.y = __byte_perm(tmp.y, tmp.x, 0x6543);
}
else if (offset < 32) {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.x), "r"(tmp.y), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.y), "r"(tmp.x), "r"(offset));
}
else {
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.y), "r"(tmp.x), "r"(offset));
asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.x), "r"(tmp.y), "r"(offset));
}
return devectorize(result);
*/
#if NVIDIA_GPU == 1
static inline uint64_t ROTR64X(const uint64_t x2, const int y) {
return rotate(x2, (ulong)(64 - y));
}
#else
static inline uint64_t ROTR64X(const uint64_t x2, const int y) {
uint2 x = as_uint2(x2);
if (y < 32) return(as_ulong(amd_bitalign(x.s10, x, y)));
else return(as_ulong(amd_bitalign(x, x.s10, (y - 32))));
}
#endif


static inline uint2 ROR2(uint2 v, unsigned a) {
uint2 result;
Expand Down Expand Up @@ -188,20 +180,6 @@ static inline uint64_t eorswap64(uint64_t u, uint64_t v)
return ROTR64X(u^v, 32);
}

/*
#define GS(a,b,c,d,e,f) \
{ \
v[a] += v[b] + m[e]; \
v[d] = eorswap32(v[d] , v[a]); \
v[c] += v[d]; \
v[b] = ROR2(v[b] ^ v[c], 24); \
v[a] += v[b] + m[f]; \
v[d] = ROR16(v[d] ^ v[a]); \
v[c] += v[d]; \
v[b] = ROR2(v[b] ^ v[c], 63); \
}
*/


#define GS(a,b,c,d,e,f) \
{ \
Expand Down Expand Up @@ -621,16 +599,15 @@ __attribute__((reqd_work_group_size(TPB_MTP, 1, 1)))
__kernel void mtp_yloop(__global unsigned int* pData, __global const uint4 * __restrict__ DBlock, __global const uint4 * __restrict__ DBlock2,
__global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint pTarget)
{
//if (get_global_id(0)==0)
// printf("entering mtp_yloop\n");

uint32_t NonceNumber = 1; // old
uint32_t ThreadNumber = 1;
uint32_t event_thread = get_global_id(0) - get_global_offset(0); //thread / ThreadNumber;

uint32_t NonceIterator = get_global_id(0);
int lane = get_local_id(0) % DIV;
int warp = get_local_id(0) / DIV;;//warp_id();
// __local ulong2 far[TPB_MTP/ DIV][256 * (LEN + SHR_OFF)];
ulong2 FarReg[8];
uint32_t farIndex;
const uint32_t half_memcost = 2 * 1024 * 1024;
Expand Down Expand Up @@ -686,7 +663,7 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint

}
farIndex = YLocal.s0 & 0x3FFFFF;
barrier(CLK_LOCAL_MEM_FENCE);


ulong8 DataChunk[2];
uint32_t len = 0;
Expand Down Expand Up @@ -719,19 +696,19 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce, uint
__global ulong2 *farP = (farIndex<half_memcost)? (__global ulong2*)&GBlock[farIndex * 64 + 0 + 8 * i + 0]
: (__global ulong2*)&GBlock2[(farIndex - half_memcost) * 64 + 0 + 8 * i + 0];

// far[warp][t + (LEN + SHR_OFF) * (lane)] = (last) ? (ulong2)(0, 0) : farP[t];

FarReg[t] = (last) ? (ulong2)(0, 0) : farP[t];
}

barrier(CLK_LOCAL_MEM_FENCE);

}

#pragma unroll
for (int t = 0; t<6; t++) {
ulong2 *D = (ulong2*)DataChunk;
D[t + 2] = (FARLOAD(t));
}
barrier(CLK_LOCAL_MEM_FENCE);

((uint16*)DataChunk)[0].lo = YLocal;

// uint16 DataTmp2;
Expand Down

0 comments on commit b7b07c8

Please sign in to comment.