Skip to content

Commit

Permalink
first released based on tpruvot interface include neoscrypt and yescrypt
Browse files Browse the repository at this point in the history
  • Loading branch information
djm34 committed May 1, 2015
1 parent 8b1137d commit 40e79ec
Show file tree
Hide file tree
Showing 50 changed files with 12,170 additions and 1,689 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ x64/Release/

compat/curl-for-windows/

*.bat
21 changes: 7 additions & 14 deletions JHA/cuda_jha_compactionTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include <memory.h>

#include "cuda_helper.h"
#include <sm_30_intrinsics.h>
#include <sm_30_intrinsics.h>

static uint32_t *d_tempBranch1Nonces[MAX_GPUS];
static uint32_t *d_numValid[MAX_GPUS];
Expand Down Expand Up @@ -265,9 +265,6 @@ __host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, uint32_t thre
jackpot_compactTest_gpu_SCAN<<<thr3,blockSize2, 32*sizeof(uint32_t)>>>(d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
}

// Sync + Anzahl merken
cudaStreamSynchronize(NULL);

if(callThrid)
cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
else
Expand All @@ -286,7 +283,7 @@ __host__ void jackpot_compactTest_cpu_singleCompaction(int thr_id, uint32_t thre
function, orgThreads, startNounce, inpHashes, d_validNonceTable);

// Sync
cudaStreamSynchronize(NULL);
cudaDeviceSynchronize();
}

////// ACHTUNG: Diese funktion geht aktuell nur mit threads > 65536 (Am besten 256 * 1024 oder 256*2048)
Expand All @@ -307,7 +304,6 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, uint32_t thread
jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch1Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes);
jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
cudaStreamSynchronize(NULL);
cudaMemcpy(&nrm[0], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch1Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
Expand All @@ -316,7 +312,6 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, uint32_t thread
jackpot_compactTest_gpu_SCAN<<<thr1,blockSize, 32*sizeof(uint32_t)>>>(d_tempBranch2Nonces[thr_id], 32, d_partSum1[thr_id], h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes);
jackpot_compactTest_gpu_SCAN<<<thr2,blockSize, 32*sizeof(uint32_t)>>>(d_partSum1[thr_id], 32, d_partSum2[thr_id]);
jackpot_compactTest_gpu_SCAN<<<1, thr2, 32*sizeof(uint32_t)>>>(d_partSum2[thr_id], (thr2>32) ? 32 : thr2);
cudaStreamSynchronize(NULL);
cudaMemcpy(&nrm[1], &(d_partSum2[thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
jackpot_compactTest_gpu_ADD<<<thr2-1, blockSize>>>(d_partSum1[thr_id]+blockSize, d_partSum2[thr_id], blockSize*thr2);
jackpot_compactTest_gpu_ADD<<<thr1-1, blockSize>>>(d_tempBranch2Nonces[thr_id]+blockSize, d_partSum1[thr_id], threads);
Expand All @@ -325,14 +320,13 @@ __host__ void jackpot_compactTest_cpu_dualCompaction(int thr_id, uint32_t thread
// Schritt 3: Scatter
jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch1Nonces[thr_id], d_nonces1, h_JackpotTrueFunction[thr_id], threads, startNounce, inpHashes);
jackpot_compactTest_gpu_SCATTER<<<thr1,blockSize,0>>>(d_tempBranch2Nonces[thr_id], d_nonces2, h_JackpotFalseFunction[thr_id], threads, startNounce, inpHashes);
cudaStreamSynchronize(NULL);
cudaDeviceSynchronize();
*/
}

__host__ void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order)
uint32_t *d_nonces1, uint32_t *nrm1,
uint32_t *d_nonces2, uint32_t *nrm2)
{
// Wenn validNonceTable genutzt wird, dann werden auch nur die Nonces betrachtet, die dort enthalten sind
// "threads" ist in diesem Fall auf die Länge dieses Array's zu setzen!
Expand All @@ -341,7 +335,6 @@ __host__ void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint
h_numValid[thr_id], d_nonces1, d_nonces2,
startNounce, inpHashes, d_validNonceTable);

cudaStreamSynchronize(NULL); // Das original braucht zwar etwas CPU-Last, ist an dieser Stelle aber evtl besser
*nrm1 = (size_t)h_numValid[thr_id][0];
*nrm2 = (size_t)h_numValid[thr_id][1];
*nrm1 = h_numValid[thr_id][0];
*nrm2 = h_numValid[thr_id][1];
}
14 changes: 5 additions & 9 deletions JHA/cuda_jha_keccak512.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "cuda_helper.h"

__constant__ uint64_t c_State[25];
__constant__ uint32_t c_PaddedMessage[18];
__constant__ uint32_t c_PaddedMessage[18];

#define U32TO64_LE(p) \
(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
Expand All @@ -31,7 +31,7 @@ __constant__ uint64_t c_keccak_round_constants[24];

static __device__ __forceinline__ void
keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
size_t i;
int i;
uint64_t t[5], u[5], v, w;

/* absorb input */
Expand Down Expand Up @@ -131,7 +131,7 @@ __global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounc
uint32_t hash[16];

#pragma unroll 8
for (size_t i = 0; i < 64; i += 8) {
for (int i = 0; i < 64; i += 8) {
U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
}

Expand Down Expand Up @@ -522,17 +522,13 @@ __host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
0, cudaMemcpyHostToDevice);
}

__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;

// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);

// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;

jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, order, thr_id);
jackpot_keccak512_gpu_hash<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
}
92 changes: 36 additions & 56 deletions JHA/jackpotcoin.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,34 +7,31 @@ extern "C"
#include "sph/sph_skein.h"
}

#include "miner.h"
#include "miner.h"
#include "cuda_helper.h"

static uint32_t *d_hash[MAX_GPUS];

extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
extern void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);

extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash);

extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash);

extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash);

extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void quark_skein512_cpu_init(int thr_id);
extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash);

extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads);
extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
uint32_t *d_nonces1, size_t *nrm1,
uint32_t *d_nonces2, size_t *nrm2,
int order);
uint32_t *d_nonces1, uint32_t *nrm1,
uint32_t *d_nonces2, uint32_t *nrm2);

extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash);

// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
static uint32_t *d_jackpotNonces[MAX_GPUS];
Expand Down Expand Up @@ -93,25 +90,22 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
{
const uint32_t first_nonce = pdata[19];

uint32_t throughput = device_intensity(thr_id, __func__, 1U << 20);
throughput = min(throughput, max_nonce - first_nonce);
uint32_t throughput = device_intensity(thr_id, __func__, 1U << 20);
throughput = min(throughput, (max_nonce - first_nonce));

if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000f;

if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
CUDA_CALL_OR_RET_X(cudaSetDevice(device_map[thr_id]), 0);

CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));

jackpot_keccak512_cpu_init(thr_id, throughput);
jackpot_compactTest_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);

quark_skein512_cpu_init(thr_id);
cuda_check_cpu_init(thr_id, throughput);

cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
Expand All @@ -131,83 +125,75 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
cuda_check_cpu_setTarget(ptarget);

do {
int order = 0;

// erstes Keccak512 Hash mit CUDA
jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
jackpot_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

size_t nrm1, nrm2, nrm3;
uint32_t nrm1, nrm2, nrm3;

// Runde 1 (ohne Gröstl)

jackpot_compactTest_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
d_branch1Nonces[thr_id], &nrm1,
d_branch3Nonces[thr_id], &nrm3,
order++);
d_branch3Nonces[thr_id], &nrm3);

// verfolge den skein-pfad weiter
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]);

// noch schnell Blake & JH
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
d_branch2Nonces[thr_id], &nrm2);

if (nrm1+nrm2 == nrm3) {
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]);
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]);
}

// Runde 3 (komplett)

// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
d_branch2Nonces[thr_id], &nrm2);

if (nrm1+nrm2 == nrm3) {
quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]);
quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]);
}

// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
d_branch2Nonces[thr_id], &nrm2);

if (nrm1+nrm2 == nrm3) {
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]);
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]);
}

// Runde 3 (komplett)

// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
d_branch2Nonces[thr_id], &nrm2);

if (nrm1+nrm2 == nrm3) {
quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]);
quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]);
}

// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch2Nonces[thr_id], &nrm2,
order++);
d_branch2Nonces[thr_id], &nrm2);

if (nrm1+nrm2 == nrm3) {
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id]);
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id]);
}

uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id]);
if (foundNonce != 0xffffffff)
{
unsigned int rounds;
Expand All @@ -220,7 +206,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,

if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
int res = 1;
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonce);
*hashes_done = pdata[19] - first_nonce + throughput;
if (secNonce != 0) {
pdata[21] = secNonce;
Expand All @@ -234,14 +220,8 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
}
}

if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
pdata[19] = max_nonce;
break;
}

pdata[19] += throughput;

} while (!work_restart[thr_id].restart);
} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));

*hashes_done = pdata[19] - first_nonce + 1;
return 0;
Expand Down
5 changes: 5 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
sph/yescryptcommon.c sph/yescrypt-opt.c sph/sha256_Y.c \
sph/sha256_Y sph/yescrypt.h sph/sysendian.h \
sph/neoscrypt.h sph/neoscrypt.c \
pluck/pluck.cu pluck/cuda_pluck.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/doom.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
Expand All @@ -55,6 +58,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
x15/whirlpool.cu x15/whirlpoolx.cu x15/cuda_whirlpoolx.cu \
x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \
yescrypt/cuda_yescrypt.cu yescrypt/yescrypt.cu \
neocrypt/cuda_neocrypt.cu neocrypt/neocrypt.cu \
x11/s3.cu

if HAVE_NVML
Expand Down
Loading

0 comments on commit 40e79ec

Please sign in to comment.