fix linux compilation (untested) kernel optimization

firoorg · Jan 27, 2019 · b7b07c8 · b7b07c8
1 parent 9a64b57
commit b7b07c8
Show file tree

Hide file tree

Showing 13 changed files with 2,455 additions and 107 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -87,5 +87,11 @@ sgminer_SOURCES += algorithm/lyra2Z.c algorithm/lyra2Z.h
 sgminer_SOURCES += algorithm/lyra2h.c algorithm/lyra2h.h
 sgminer_SOURCES += gbt-util.c gbt-util.h
 
+sgminer_SOURCES += algorithm/argon2d/argon2ref/blake2/blake2b.c  algorithm/argon2d/argon2ref/argon2.c  algorithm/argon2d/argon2ref/core.c algorithm/argon2d/argon2ref/opt.c  algorithm/argon2d/argon2ref/thread.c algorithm/argon2d/argon2ref/encoding.c algorithm/argon2d/argon2ref/argon2.h
+sgminer_SOURCES += algorithm/argon2d/argon2d.c algorithm/argon2d/argon2d.h
+sgminer_SOURCES += mtp_argon2ref/mtp_argon2.c mtp_argon2ref/mtp_blake2ba.c mtp_argon2ref/mtp_core.c mtp_argon2ref/mtp_encoding.c mtp_argon2ref/mtp_ref.c mtp_argon2ref/mtp_thread.c mtp_argon2ref/mtp_argon2.h mtp_argon2ref/mtp_blake2-impl.h mtp_argon2ref/mtp_blake2.h mtp_argon2ref/mtp_blake2b-load-sse2.h mtp_argon2ref/mtp_blake2b-load-sse41.h mtp_argon2ref/mtp_blake2b-round.h mtp_argon2ref/mtp_blamka-round-opt.h mtp_argon2ref/mtp_blamka-round-ref.h mtp_argon2ref/mtp_core.h mtp_argon2ref/mtp_encoding.h mtp_argon2ref/mtp_thread.h
+sgminer_SOURCES += algorithm/mtp_algo.c algorithm/mtp_algo.h
+sgminer_SOURCES += merkletree/merkle-tree.cpp merkletree/mtp.cpp merkletree/merkle-tree.hpp merkletree/mtp.h
+
 bin_SCRIPTS	= $(top_srcdir)/kernel/*.cl
 
diff --git a/algorithm.c b/algorithm.c
@@ -55,6 +55,7 @@
 
 const char *algorithm_type_str[] = {
   "mtp",
+  "mtp_vega",
   "Unknown",
   "Credits",
   "Scrypt",
@@ -1708,6 +1709,7 @@ static algorithm_settings_t algos[] = {
   { "lyra2Z"   , ALGO_LYRA2Z   , "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 1, 0,0, lyra2Z_regenhash   , precalc_hash_blake256, queue_lyra2z_kernel   , gen_hash, NULL },
   { "lyra2h"   , ALGO_LYRA2H   , "", 1, 256, 256, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 1, 0,0, lyra2h_regenhash   , precalc_hash_blake256, queue_lyra2h_kernel   , gen_hash, NULL },
   { "mtp"   , ALGO_MTP   , "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 1, 0,0, mtp_regenhash   , NULL, queue_mtp_kernel   , gen_hash, NULL },
+  { "mtp_vega"   , ALGO_MTP   , "", 1, 1, 1, 0, 0, 0xFF, 0xFFFFULL, 0x0000ffffUL, 1, 0,0, mtp_regenhash   , NULL, queue_mtp_kernel   , gen_hash, NULL },
 
   // kernels starting from this will have difficulty calculated by using fuguecoin algorithm
 #define A_FUGUE(a, b, c) \

diff --git a/configure.ac b/configure.ac
@@ -255,7 +255,7 @@ AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
 AM_CONDITIONAL([HAVE_x86_64], [test x$have_x86_64 = xtrue])
 
 AC_CONFIG_SUBDIRS([submodules/jansson])
-JANSSON_LIBS="submodules/jansson/src/.libs/libjansson.a"
+JANSSON_LIBS="submodules/jansson/src/.libs/libbosjansson.a"
 
 PKG_PROG_PKG_CONFIG()
 

diff --git a/driver-opencl.c b/driver-opencl.c
@@ -1324,7 +1324,7 @@ static bool opencl_thread_prepare(struct thr_info *thr)
 
 static bool opencl_thread_init(struct thr_info *thr)
 {
-applog(LOG_DEBUG, "****************coming to opencl_thread_init *******************");
+
   const int thr_id = thr->id;
   struct cgpu_info *gpu = thr->cgpu;
   struct opencl_thread_data *thrdata;
@@ -1346,7 +1346,7 @@ applog(LOG_DEBUG, "****************coming to opencl_thread_init ****************
     applog(LOG_ERR, "Failed to calloc in opencl_thread_init");
     return false;
   }
-
+if (clState != NULL)
   status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
     BUFFERSIZE, blank_res, 0, NULL, NULL);
   if (unlikely(status != CL_SUCCESS)) {
@@ -1359,7 +1359,7 @@ applog(LOG_DEBUG, "****************coming to opencl_thread_init ****************
   gpu->status = LIFE_WELL;
 
   gpu->device_last_well = time(NULL);
-  applog(LOG_DEBUG, "****************leave opencl_thread_init *******************");
+
   return true;
 }
 

diff --git a/kernel/mtp.cl b/kernel/mtp.cl
@@ -3,8 +3,18 @@
 * MTP
 * djm34 2017-2018
 * krnlx 2018
+* djm34 2019
 **/
 
+#define NVIDIA_GPU 0
+#ifdef cl_nv_pragma_unroll
+#define NVIDIA
+#undef NVIDIA_GPU
+#define NVIDIA_GPU 1
+#endif
+
+
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
 typedef unsigned long uint64_t;
 typedef uint uint32_t;
 //typedef unsigned char uint8_t;
@@ -115,37 +125,19 @@ __constant static const uchar blake2b_sigma[12][16] =
 #define SPH_ROTL64(x, n) rotate(as_ulong(x), (n) & 0xFFFFFFFFFFFFFFFFUL)
 #define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n)))
 
-/*__device__ __forceinline__*/
-static inline uint64_t ROTR64X(const uint64_t value, const int offset) {
-// return rotate(value, (ulong)(64 - offset));
-	return SPH_ROTR64(value, offset);
-/*
-	uint2 result;
-	const uint2 tmp = vectorize(value);
 
-	if (offset == 8) {
-		result.x = __byte_perm(tmp.x, tmp.y, 0x4321);
-		result.y = __byte_perm(tmp.y, tmp.x, 0x4321);
-	}
-	else if (offset == 16) {
-		result.x = __byte_perm(tmp.x, tmp.y, 0x5432);
-		result.y = __byte_perm(tmp.y, tmp.x, 0x5432);
-	}
-	else if (offset == 24) {
-		result.x = __byte_perm(tmp.x, tmp.y, 0x6543);
-		result.y = __byte_perm(tmp.y, tmp.x, 0x6543);
-	}
-	else if (offset < 32) {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.x), "r"(tmp.y), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.y), "r"(tmp.x), "r"(offset));
-	}
-	else {
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.y), "r"(tmp.x), "r"(offset));
-		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.x), "r"(tmp.y), "r"(offset));
-	}
-	return devectorize(result);
-*/
+#if NVIDIA_GPU == 1
+static inline uint64_t ROTR64X(const uint64_t x2, const int y) {
+ return rotate(x2, (ulong)(64 - y));
 }
+#else 
+static inline uint64_t ROTR64X(const uint64_t x2, const int y) {
+	uint2 x = as_uint2(x2);
+	if (y < 32) return(as_ulong(amd_bitalign(x.s10, x, y)));
+	else return(as_ulong(amd_bitalign(x, x.s10, (y - 32))));
+}
+#endif
+
 
 static inline uint2 ROR2(uint2 v, unsigned a) {
 	uint2 result;
@@ -188,20 +180,6 @@ static inline uint64_t eorswap64(uint64_t u, uint64_t v)
 		return ROTR64X(u^v, 32);
 }
 
-/*
-#define GS(a,b,c,d,e,f) \
-{ \
-v[a] +=   v[b] + m[e]; \
-v[d] = eorswap32(v[d] , v[a]); \
-v[c] += v[d]; \
-v[b] = ROR2(v[b] ^ v[c], 24); \
-v[a] += v[b] + m[f]; \
-v[d] = ROR16(v[d] ^ v[a]); \
-v[c] += v[d]; \
-v[b] = ROR2(v[b] ^ v[c], 63); \
-}
-*/
-
 
 #define GS(a,b,c,d,e,f) \
    { \
@@ -621,16 +599,15 @@ __attribute__((reqd_work_group_size(TPB_MTP, 1, 1)))
 __kernel void mtp_yloop(__global unsigned int* pData, __global const uint4  * __restrict__ DBlock, __global const uint4  * __restrict__ DBlock2,
 __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce,  uint pTarget)
 {
-//if (get_global_id(0)==0)
-//		printf("entering mtp_yloop\n");
+
 	uint32_t NonceNumber = 1;  // old
 	uint32_t ThreadNumber = 1;
 	uint32_t event_thread = get_global_id(0) - get_global_offset(0); //thread / ThreadNumber;
 
 	uint32_t NonceIterator = get_global_id(0);
 	int lane = get_local_id(0) % DIV;
 	int warp = get_local_id(0) / DIV;;//warp_id();
-//	__local  ulong2 far[TPB_MTP/ DIV][256 * (LEN + SHR_OFF)];
+ 
 		ulong2 FarReg[8]; 
 	 uint32_t farIndex;
 	const uint32_t half_memcost = 2 * 1024 * 1024;
@@ -686,7 +663,7 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce,  uint
 
 			}
 			farIndex = YLocal.s0 & 0x3FFFFF;
-			barrier(CLK_LOCAL_MEM_FENCE);
+
 
 			ulong8 DataChunk[2];
 			uint32_t len = 0;
@@ -719,19 +696,19 @@ __global uint4 * Elements, __global uint32_t * __restrict__ SmallestNonce,  uint
 						__global ulong2 *farP = (farIndex<half_memcost)?  (__global ulong2*)&GBlock[farIndex * 64 + 0 + 8 * i + 0]
 																				 : (__global ulong2*)&GBlock2[(farIndex - half_memcost) * 64 + 0 + 8 * i + 0];
 
-					//	far[warp][t + (LEN + SHR_OFF) * (lane)] = (last) ? (ulong2)(0, 0) : farP[t];
+
 						FarReg[t] = (last) ? (ulong2)(0, 0) : farP[t];
 					}
 
-					barrier(CLK_LOCAL_MEM_FENCE);
+
 				}
 
 				#pragma unroll
 				for (int t = 0; t<6; t++) {
 					ulong2 *D = (ulong2*)DataChunk;
 					D[t + 2] = (FARLOAD(t));
 				}
-				barrier(CLK_LOCAL_MEM_FENCE);
+
 				((uint16*)DataChunk)[0].lo = YLocal;
 
 				//	uint16 DataTmp2;