Permalink
Browse files

This is a mostly performance-neutral (maybe immeasurably positive) co…

…mmit in preparation for later structural changes.
  • Loading branch information...
1 parent beb3512 commit b81b1d0a20e1ca6baaeb5db20dae74190d87638a @dave-andersen committed Jan 12, 2014
Showing with 18 additions and 9 deletions.
  1. +12 −9 src/gpuhash.cu
  2. +6 −0 src/gpuhash.h
View
@@ -31,6 +31,7 @@ int GPUHasher::Initialize() {
exit(-1);
}
+ cudaStream_t *streamptr = (cudaStream_t *)opaqueStream_t;
error = cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
size_t free, total;
@@ -44,6 +45,8 @@ int GPUHasher::Initialize() {
return -1;
}
+ cudaStreamCreate(streamptr);
+
#define MOMENTUM_N_HASHES (1<<26)
/* Note: This is the allocation size. We can only use
* one less than this because each countbit entry uses two bits. */
@@ -83,6 +86,7 @@ GPUHasher::~GPUHasher() {
int GPUHasher::ComputeHashes(uint64_t data[16], uint64_t *hashes) {
cudaError_t error;
+ cudaStream_t *streamptr = (cudaStream_t *)opaqueStream_t;
error = cudaMemcpy(dev_data, data, sizeof(uint64_t)*16, cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Could not memcpy dev_data (%d)\n", error);
@@ -94,22 +98,21 @@ int GPUHasher::ComputeHashes(uint64_t data[16], uint64_t *hashes) {
// 1024 grid slots
dim3 gridsize(4096,32);
- cudaMemset(dev_results, 0, sizeof(uint64_t)*N_RESULTS);
- cudaMemset(dev_countbits, 0, sizeof(uint32_t)*NUM_COUNTBITS_WORDS);
- search_sha512_kernel<<<gridsize, 64>>>(dev_data, dev_hashes, dev_countbits);
- filter_sha512_kernel<<<gridsize, 64>>>(dev_hashes, dev_countbits);
- cudaMemset(dev_countbits, 0, sizeof(uint32_t)*NUM_COUNTBITS_WORDS);
- populate_filter_kernel<<<gridsize, 64>>>(dev_hashes, dev_countbits);
- filter_and_rewrite_sha512_kernel<<<gridsize, 64>>>(dev_hashes, dev_countbits, dev_results);
+ cudaMemsetAsync(dev_results, 0, sizeof(uint64_t)*N_RESULTS, *streamptr);
+ cudaMemsetAsync(dev_countbits, 0, sizeof(uint32_t)*NUM_COUNTBITS_WORDS, *streamptr);
+ search_sha512_kernel<<<gridsize, 64, 0, *streamptr>>>(dev_data, dev_hashes, dev_countbits);
+ filter_sha512_kernel<<<gridsize, 64, 0, *streamptr>>>(dev_hashes, dev_countbits);
+ cudaMemsetAsync(dev_countbits, 0, sizeof(uint32_t)*NUM_COUNTBITS_WORDS, *streamptr);
+ populate_filter_kernel<<<gridsize, 64, 0, *streamptr>>>(dev_hashes, dev_countbits);
+ filter_and_rewrite_sha512_kernel<<<gridsize, 64, 0, *streamptr>>>(dev_hashes, dev_countbits, dev_results);
+ error = cudaMemcpyAsync(hashes, dev_results, sizeof(uint64_t)*N_RESULTS, cudaMemcpyDeviceToHost, *streamptr);
error = cudaDeviceSynchronize();
if (error != cudaSuccess) {
fprintf(stderr, "Error in kernel exec (%d)\n", error);
return -1;
}
- error = cudaMemcpy(hashes, dev_results, sizeof(uint64_t)*N_RESULTS, cudaMemcpyDeviceToHost);
-
if (error != cudaSuccess) {
fprintf(stderr, "Could not memcpy dev_hashes out (%d)\n", error);
return -1;
View
@@ -13,4 +13,10 @@ class GPUHasher {
uint64_t *dev_hashes;
uint32_t *dev_countbits;
uint64_t *dev_results;
+
+ /* This is an opaque blob that holds a cudaStream_t, but is not
+ * exposed in the header so that the caller code does not need to
+ * include any cuda header files.
+ */
+ uint8_t opaqueStream_t[64];
};

1 comment on commit b81b1d0

Not good, this change makes AWS g2.2xlarge go from ~790 to ~550

Please sign in to comment.