Skip to content

Commit

Permalink
resolve compatibility issues with linux and vector initializations
Browse files Browse the repository at this point in the history
make_uint16 works for both linux and windows (at the same time),
previous logic were mutually exclusive
  • Loading branch information
djm34 committed Aug 2, 2015
1 parent da9aa83 commit 8364fbd
Showing 1 changed file with 47 additions and 146 deletions.
193 changes: 47 additions & 146 deletions lyra2/cuda_lyra2v2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,127 +26,25 @@ __device__ vectype *DMatrix;


static __device__ __forceinline__ void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
{

a += b; d ^= a; d = SWAPDWORDS2(d);
c += d; b ^= c; b = ROR24(b);
a += b; d ^= a; d = ROR16(d);
c += d; b ^= c; b = ROR2(b, 63);

}

static __device__ __forceinline__ void Gfunc_v35_p1(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
{

a += b; d ^= a; d = SWAPDWORDS2(d);
c += d; b ^= c; b = ROR24(b);
}

static __device__ __forceinline__ void Gfunc_v35_p2(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = ROR16(d);
c += d; b ^= c; b = ROR2(b, 63);
}


static __device__ __forceinline__ void Gfunc_v35(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
{

a += b; d ^= a; d = ROTR64(d, 32);
c += d; b ^= c; b = ROTR64(b, 24);
a += b; d ^= a; d = ROTR64(d, 16);
c += d; b ^= c; b = ROTR64(b, 63);

}

static __device__ __forceinline__ void Gfunc_v35_p1(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
static __device__ __forceinline__ void Gfunc_v35(unsigned long long & a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
{

a += b; d ^= a; d = ROTR64(d, 32);
c += d; b ^= c; b = ROTR64(b, 24);
}

static __device__ __forceinline__ void Gfunc_v35_p2(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
{

a += b; d ^= a; d = ROTR64(d, 16);
c += d; b ^= c; b = ROTR64(b, 63);
}

#define RORa(d) make_uint28(SWAPDWORDS2(d.x),SWAPDWORDS2(d.y),SWAPDWORDS2(d.z),SWAPDWORDS2(d.w))
#define RORb(d) make_uint28(ROR24(d.x),ROR24(d.y),ROR24(d.z),ROR24(d.w))
#define RORc(d) make_uint28(ROR16(d.x),ROR16(d.y),ROR16(d.z),ROR16(d.w))
#define RORd(d) make_uint28(ROR2(d.x,63),ROR2(d.y,63),ROR2(d.z,63),ROR2(d.w,63))


static __device__ __forceinline__ ulonglong4 make_vectype(const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t d)
{
return make_ulonglong4(a, b, c, d);
}

static __device__ __forceinline__ uint28 make_vectype(const uint2 a, const uint2 b, const uint2 c, const uint2 d)
{
return make_uint28(a, b, c, d);
}


static __device__ __forceinline__ void Gfunc_v4(ulonglong4 & a, ulonglong4 &b, ulonglong4 &c, ulonglong4 &d)
{
#define ROR4(d,n) make_ulonglong4(ROTR64(d.x,n),ROTR64(d.y,n),ROTR64(d.z,n),ROTR64(d.w,n))
a += b; d ^= a; d = ROR4(d, 32);
c += d; b ^= c; b = ROR4(b, 24);
a += b; d ^= a; d = ROR4(d, 16);
c += d; b ^= c; b = ROR4(b, 63);
#undef ROR4
}

static __device__ __forceinline__ void Gfunc_v4(uint28 & a, uint28 &b, uint28 &c, uint28 &d)
{
#define ROR4(d,n) make_uint28(ROR2(d.x,n),ROR2(d.y,n),ROR2(d.z,n),ROR2(d.w,n))
a += b; d ^= a; d = RORa(d);
c += d; b ^= c; b = RORb(b);
a += b; d ^= a; d = RORc(d);
c += d; b ^= c; b = RORd(b);
#undef ROR4
}



static __device__ __forceinline__ void round_lyra64(uint64_t* s)
{
Gfunc_v35(s[0], s[4], s[8], s[12]);
Gfunc_v35(s[1], s[5], s[9], s[13]);
Gfunc_v35(s[2], s[6], s[10], s[14]);
Gfunc_v35(s[3], s[7], s[11], s[15]);
Gfunc_v35(s[0], s[5], s[10], s[15]);
Gfunc_v35(s[1], s[6], s[11], s[12]);
Gfunc_v35(s[2], s[7], s[8], s[13]);
Gfunc_v35(s[3], s[4], s[9], s[14]);
}

static __device__ __forceinline__ void round_lyra_v35(uint2_16* s)
{
Gfunc_v35(s[0].s0, s[0].s4, s[0].s8, s[0].sc);
Gfunc_v35(s[0].s1, s[0].s5, s[0].s9, s[0].sd);
Gfunc_v35(s[0].s2, s[0].s6, s[0].sa, s[0].se);
Gfunc_v35(s[0].s3, s[0].s7, s[0].sb, s[0].sf);
Gfunc_v35(s[0].s0, s[0].s5, s[0].sa, s[0].sf);
Gfunc_v35(s[0].s1, s[0].s6, s[0].sb, s[0].sc);
Gfunc_v35(s[0].s2, s[0].s7, s[0].s8, s[0].sd);
Gfunc_v35(s[0].s3, s[0].s4, s[0].s9, s[0].se);
}

static __device__ __forceinline__ void round_lyra_v35(uint2* s)
{
Gfunc_v35(s[0], s[4], s[8], s[12]);
Gfunc_v35(s[1], s[5], s[9], s[13]);
Gfunc_v35(s[2], s[6], s[10], s[14]);
Gfunc_v35(s[3], s[7], s[11], s[15]);
Gfunc_v35(s[0], s[5], s[10], s[15]);
Gfunc_v35(s[1], s[6], s[11], s[12]);
Gfunc_v35(s[2], s[7], s[8], s[13]);
Gfunc_v35(s[3], s[4], s[9], s[14]);
}

static __device__ __forceinline__ void round_lyra_v35(vectype* s)
{
Expand All @@ -162,6 +60,8 @@ static __device__ __forceinline__ void round_lyra_v35(vectype* s)
Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);

}



static __device__ __forceinline__ void reduceDuplex(vectype state[4], uint32_t thread)
{
Expand Down Expand Up @@ -467,26 +367,26 @@ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outpu
uint28 padding[2];
if (threadIdx.x == 0) {

((uint2_8*)blake2b_IV)[0] = {
{ 0xf3bcc908, 0x6a09e667 },
{ 0x84caa73b, 0xbb67ae85 },
{ 0xfe94f82b, 0x3c6ef372 },
{ 0x5f1d36f1, 0xa54ff53a },
{ 0xade682d1, 0x510e527f },
{ 0x2b3e6c1f, 0x9b05688c },
{ 0xfb41bd6b, 0x1f83d9ab },
{ 0x137e2179, 0x5be0cd19 }
};
((uint2_8*)padding)[0] = {
{ 0x20, 0x0 },
{ 0x20, 0x0 },
{ 0x20, 0x0 },
{ 0x01, 0x0 },
{ 0x04, 0x0 },
{ 0x04, 0x0 },
{ 0x80, 0x0 },
{ 0x0, 0x01000000 }
};
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667 ,
0x84caa73b, 0xbb67ae85 ,
0xfe94f82b, 0x3c6ef372 ,
0x5f1d36f1, 0xa54ff53a ,
0xade682d1, 0x510e527f ,
0x2b3e6c1f, 0x9b05688c ,
0xfb41bd6b, 0x1f83d9ab ,
0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0 ,
0x20, 0x0 ,
0x20, 0x0 ,
0x01, 0x0 ,
0x04, 0x0 ,
0x04, 0x0 ,
0x80, 0x0 ,
0x0, 0x01000000
);

}

Expand Down Expand Up @@ -571,29 +471,30 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa

uint28 blake2b_IV[2];
uint28 padding[2];
if (threadIdx.x == 0) {

((uint2_8*)blake2b_IV)[0] = {
{ 0xf3bcc908, 0x6a09e667 },
{ 0x84caa73b, 0xbb67ae85 },
{ 0xfe94f82b, 0x3c6ef372 },
{ 0x5f1d36f1, 0xa54ff53a },
{ 0xade682d1, 0x510e527f },
{ 0x2b3e6c1f, 0x9b05688c },
{ 0xfb41bd6b, 0x1f83d9ab },
{ 0x137e2179, 0x5be0cd19 }
};
((uint2_8*)padding)[0] = {
{ 0x20, 0x0 },
{ 0x20, 0x0 },
{ 0x20, 0x0 },
{ 0x01, 0x0 },
{ 0x04, 0x0 },
{ 0x04, 0x0 },
{ 0x80, 0x0 },
{ 0x0, 0x01000000 }
};
}
if (threadIdx.x == 0) {

((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667 ,
0x84caa73b, 0xbb67ae85 ,
0xfe94f82b, 0x3c6ef372 ,
0x5f1d36f1, 0xa54ff53a ,
0xade682d1, 0x510e527f ,
0x2b3e6c1f, 0x9b05688c ,
0xfb41bd6b, 0x1f83d9ab ,
0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0 ,
0x20, 0x0 ,
0x20, 0x0 ,
0x01, 0x0 ,
0x04, 0x0 ,
0x04, 0x0 ,
0x80, 0x0 ,
0x0, 0x01000000
);

}

#if __CUDA_ARCH__ == 350
if (thread < threads)
Expand Down

0 comments on commit 8364fbd

Please sign in to comment.