resolve compatibility issues with linux and vector initializations

make_uint16 works for both linux and windows (at the same time), previous logic were mutually exclusive
djm34 · Aug 2, 2015 · 8364fbd · 8364fbd
1 parent da9aa83
commit 8364fbd
Showing 1 changed file with 47 additions and 146 deletions.
diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu
@@ -26,127 +26,25 @@ __device__ vectype  *DMatrix;
 
 
 static __device__ __forceinline__ void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
-{
-
-	a += b; d ^= a; d = SWAPDWORDS2(d);
-	c += d; b ^= c; b = ROR24(b);
-	a += b; d ^= a; d = ROR16(d);
-	c += d; b ^= c; b = ROR2(b, 63);
-
-}
-
-static __device__ __forceinline__ void Gfunc_v35_p1(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
 {
 
 	a += b; d ^= a; d = SWAPDWORDS2(d);
 	c += d; b ^= c; b = ROR24(b);
-}
-
-static __device__ __forceinline__ void Gfunc_v35_p2(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
-{
 	a += b; d ^= a; d = ROR16(d);
 	c += d; b ^= c; b = ROR2(b, 63);
-}
-
-
-static __device__ __forceinline__ void Gfunc_v35(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
-{
-
-	a += b; d ^= a; d = ROTR64(d, 32);
-	c += d; b ^= c; b = ROTR64(b, 24);
-	a += b; d ^= a; d = ROTR64(d, 16);
-	c += d; b ^= c; b = ROTR64(b, 63);
 
 }
 
-static __device__ __forceinline__ void Gfunc_v35_p1(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
+static __device__ __forceinline__ void Gfunc_v35(unsigned long long & a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
 {
 
 	a += b; d ^= a; d = ROTR64(d, 32);
 	c += d; b ^= c; b = ROTR64(b, 24);
-}
-
-static __device__ __forceinline__ void Gfunc_v35_p2(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
-{
-
 	a += b; d ^= a; d = ROTR64(d, 16);
 	c += d; b ^= c; b = ROTR64(b, 63);
-}
-
-#define RORa(d) make_uint28(SWAPDWORDS2(d.x),SWAPDWORDS2(d.y),SWAPDWORDS2(d.z),SWAPDWORDS2(d.w))
-#define RORb(d) make_uint28(ROR24(d.x),ROR24(d.y),ROR24(d.z),ROR24(d.w))
-#define RORc(d) make_uint28(ROR16(d.x),ROR16(d.y),ROR16(d.z),ROR16(d.w))
-#define RORd(d) make_uint28(ROR2(d.x,63),ROR2(d.y,63),ROR2(d.z,63),ROR2(d.w,63))
-
 
-static __device__ __forceinline__ ulonglong4 make_vectype(const uint64_t  a, const uint64_t b, const uint64_t c, const uint64_t d)
-{
-	return make_ulonglong4(a, b, c, d);	
-}
-
-static __device__ __forceinline__ uint28 make_vectype(const uint2  a, const uint2 b, const uint2 c, const uint2 d)
-{
-	return make_uint28(a, b, c, d);
-}
-
-
-static __device__ __forceinline__ void Gfunc_v4(ulonglong4 & a, ulonglong4 &b, ulonglong4 &c, ulonglong4 &d)
-{
-#define ROR4(d,n) make_ulonglong4(ROTR64(d.x,n),ROTR64(d.y,n),ROTR64(d.z,n),ROTR64(d.w,n))
-	a += b; d ^= a; d = ROR4(d, 32);
-	c += d; b ^= c; b = ROR4(b, 24);
-	a += b; d ^= a; d = ROR4(d, 16);
-	c += d; b ^= c; b = ROR4(b, 63);
-#undef ROR4
-}
-
-static __device__ __forceinline__ void Gfunc_v4(uint28 & a, uint28 &b, uint28 &c, uint28 &d)
-{
-#define ROR4(d,n) make_uint28(ROR2(d.x,n),ROR2(d.y,n),ROR2(d.z,n),ROR2(d.w,n))
-	a += b; d ^= a; d = RORa(d);
-	c += d; b ^= c; b = RORb(b);
-	a += b; d ^= a; d = RORc(d);
-	c += d; b ^= c; b = RORd(b);
-#undef ROR4
-}
-
-
-
-static __device__ __forceinline__ void round_lyra64(uint64_t* s)  
-{  
-	Gfunc_v35(s[0], s[4], s[8], s[12]);  
-	Gfunc_v35(s[1], s[5], s[9], s[13]);  
-	Gfunc_v35(s[2], s[6], s[10], s[14]);  
-	Gfunc_v35(s[3], s[7], s[11], s[15]);  
-	Gfunc_v35(s[0], s[5], s[10], s[15]);   
-	Gfunc_v35(s[1], s[6], s[11], s[12]);  
-	Gfunc_v35(s[2], s[7], s[8], s[13]);  
-	Gfunc_v35(s[3], s[4], s[9], s[14]);  
 }
 
-static __device__ __forceinline__ void round_lyra_v35(uint2_16* s)
-{
-	    Gfunc_v35(s[0].s0, s[0].s4, s[0].s8, s[0].sc);  
-		Gfunc_v35(s[0].s1, s[0].s5, s[0].s9, s[0].sd);  
-		Gfunc_v35(s[0].s2, s[0].s6, s[0].sa, s[0].se);  
-		Gfunc_v35(s[0].s3, s[0].s7, s[0].sb, s[0].sf);  
-		Gfunc_v35(s[0].s0, s[0].s5, s[0].sa, s[0].sf);  
-		Gfunc_v35(s[0].s1, s[0].s6, s[0].sb, s[0].sc);  
-		Gfunc_v35(s[0].s2, s[0].s7, s[0].s8, s[0].sd);  
-		Gfunc_v35(s[0].s3, s[0].s4, s[0].s9, s[0].se);  
-}
-
-static __device__ __forceinline__ void round_lyra_v35(uint2* s)
-{
-	Gfunc_v35(s[0], s[4], s[8], s[12]);
-	Gfunc_v35(s[1], s[5], s[9], s[13]);
-	Gfunc_v35(s[2], s[6], s[10], s[14]);
-	Gfunc_v35(s[3], s[7], s[11], s[15]);
-	Gfunc_v35(s[0], s[5], s[10], s[15]);
-	Gfunc_v35(s[1], s[6], s[11], s[12]);
-	Gfunc_v35(s[2], s[7], s[8], s[13]);
-	Gfunc_v35(s[3], s[4], s[9], s[14]);
-}
 
 static __device__ __forceinline__ void round_lyra_v35(vectype* s)
 {
@@ -162,6 +60,8 @@ static __device__ __forceinline__ void round_lyra_v35(vectype* s)
 	Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
 
 }
+
+
 
 static __device__ __forceinline__ void reduceDuplex(vectype state[4], uint32_t thread)
 {
@@ -467,26 +367,26 @@ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outpu
 	uint28 padding[2];
 	if (threadIdx.x == 0) {
 
-		((uint2_8*)blake2b_IV)[0] = {
-			{ 0xf3bcc908, 0x6a09e667 },
-			{ 0x84caa73b, 0xbb67ae85 },
-			{ 0xfe94f82b, 0x3c6ef372 },
-			{ 0x5f1d36f1, 0xa54ff53a },
-			{ 0xade682d1, 0x510e527f },
-			{ 0x2b3e6c1f, 0x9b05688c },
-			{ 0xfb41bd6b, 0x1f83d9ab },
-			{ 0x137e2179, 0x5be0cd19 }
-		};
-		((uint2_8*)padding)[0] = {
-			{ 0x20, 0x0 },
-			{ 0x20, 0x0 },
-			{ 0x20, 0x0 },
-			{ 0x01, 0x0 },
-			{ 0x04, 0x0 },
-			{ 0x04, 0x0 },
-			{ 0x80, 0x0 },
-			{ 0x0, 0x01000000 }
-		};
+		((uint16*)blake2b_IV)[0] = make_uint16(
+			 0xf3bcc908, 0x6a09e667 ,
+			 0x84caa73b, 0xbb67ae85 ,
+			 0xfe94f82b, 0x3c6ef372 ,
+			 0x5f1d36f1, 0xa54ff53a ,
+			 0xade682d1, 0x510e527f ,
+			 0x2b3e6c1f, 0x9b05688c ,
+			 0xfb41bd6b, 0x1f83d9ab ,
+			 0x137e2179, 0x5be0cd19 
+		);
+		((uint16*)padding)[0] = make_uint16( 
+			 0x20, 0x0 ,
+			 0x20, 0x0 ,
+			 0x20, 0x0 ,
+			 0x01, 0x0 ,
+			 0x04, 0x0 ,
+			 0x04, 0x0 ,
+			 0x80, 0x0 ,
+			 0x0, 0x01000000 
+		);
 
 	}
 
@@ -571,29 +471,30 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa
 
 	   uint28 blake2b_IV[2];
 	   uint28 padding[2];
-	   if (threadIdx.x == 0) {
-
-		   ((uint2_8*)blake2b_IV)[0] = {
-			   { 0xf3bcc908, 0x6a09e667 },
-			   { 0x84caa73b, 0xbb67ae85 },
-			   { 0xfe94f82b, 0x3c6ef372 },
-			   { 0x5f1d36f1, 0xa54ff53a },
-			   { 0xade682d1, 0x510e527f },
-			   { 0x2b3e6c1f, 0x9b05688c },
-			   { 0xfb41bd6b, 0x1f83d9ab },
-			   { 0x137e2179, 0x5be0cd19 }
-		   };
-		   ((uint2_8*)padding)[0] = {
-			   { 0x20, 0x0 },
-			   { 0x20, 0x0 },
-			   { 0x20, 0x0 },
-			   { 0x01, 0x0 },
-			   { 0x04, 0x0 },
-			   { 0x04, 0x0 },
-			   { 0x80, 0x0 },
-			   { 0x0, 0x01000000 }
-		   };
-       }
+	if (threadIdx.x == 0) {
+
+		((uint16*)blake2b_IV)[0] = make_uint16(
+			 0xf3bcc908, 0x6a09e667 ,
+			 0x84caa73b, 0xbb67ae85 ,
+			 0xfe94f82b, 0x3c6ef372 ,
+			 0x5f1d36f1, 0xa54ff53a ,
+			 0xade682d1, 0x510e527f ,
+			 0x2b3e6c1f, 0x9b05688c ,
+			 0xfb41bd6b, 0x1f83d9ab ,
+			 0x137e2179, 0x5be0cd19 
+		);
+		((uint16*)padding)[0] = make_uint16( 
+			 0x20, 0x0 ,
+			 0x20, 0x0 ,
+			 0x20, 0x0 ,
+			 0x01, 0x0 ,
+			 0x04, 0x0 ,
+			 0x04, 0x0 ,
+			 0x80, 0x0 ,
+			 0x0, 0x01000000 
+		);
+
+	}
 
 #if __CUDA_ARCH__ == 350
 	if (thread < threads)