diff --git a/block.go b/block.go
index b3ca29d..6dde97c 100644
--- a/block.go
+++ b/block.go
@@ -179,11 +179,11 @@ func (header BlockHeader) ID() (BlockID, error) {
 }
 
 // IDFast computes an ID for a given block header when mining.
-func (header *BlockHeader) IDFast() *big.Int {
+func (header *BlockHeader) IDFast(minerNum int) (*big.Int, int64) {
 	if header.hasher == nil {
 		header.hasher = NewBlockHeaderHasher()
 	}
-	return header.hasher.Update(header)
+	return header.hasher.Update(minerNum, header)
 }
 
 // Compare returns true if the header indicates it is a better chain than "theirHeader" up to both points.
diff --git a/block_header_hasher.go b/block_header_hasher.go
index f282c7d..837d304 100644
--- a/block_header_hasher.go
+++ b/block_header_hasher.go
@@ -6,6 +6,7 @@ package cruzbit
 import (
 	"encoding/hex"
 	"hash"
+	"log"
 	"math/big"
 	"strconv"
 
@@ -32,12 +33,13 @@ type BlockHeaderHasher struct {
 	txCountLen int
 
 	// used for hashing
-	initialized bool
-	bufLen      int
-	buffer      []byte
-	hasher      HashWithRead
-	resultBuf   [32]byte
-	result      *big.Int
+	initialized      bool
+	bufLen           int
+	buffer           []byte
+	hasher           HashWithRead
+	resultBuf        [32]byte
+	result           *big.Int
+	hashesPerAttempt int64
 }
 
 // HashWithRead extends hash.Hash to provide a Read interface.
@@ -69,9 +71,10 @@ func NewBlockHeaderHasher() *BlockHeaderHasher {
 
 	// initialize the hasher
 	return &BlockHeaderHasher{
-		buffer: make([]byte, bufLen),
-		hasher: sha3.New256().(HashWithRead),
-		result: new(big.Int),
+		buffer:           make([]byte, bufLen),
+		hasher:           sha3.New256().(HashWithRead),
+		result:           new(big.Int),
+		hashesPerAttempt: 1,
 	}
 }
 
@@ -146,12 +149,20 @@ func (h *BlockHeaderHasher) initBuffer(header *BlockHeader) {
 }
 
 // Update is called everytime the header is updated and the caller wants its new hash value/ID.
-func (h *BlockHeaderHasher) Update(header *BlockHeader) *big.Int {
+func (h *BlockHeaderHasher) Update(minerNum int, header *BlockHeader) (*big.Int, int64) {
 	if !h.initialized {
 		h.initBuffer(header)
+		if CUDA_ENABLED {
+			lastOffset := h.nonceOffset + h.nonceLen
+			h.hashesPerAttempt = CudaMinerUpdate(minerNum, h.buffer, h.bufLen,
+				h.nonceOffset, lastOffset, header.Target)
+		}
 	} else {
+		var bufferChanged bool
+
 		// hash_list_root
 		if h.previousHashListRoot != header.HashListRoot {
+			bufferChanged = true
 			// write out the new value
 			h.previousHashListRoot = header.HashListRoot
 			hex.Encode(h.buffer[h.hashListRootOffset:], header.HashListRoot[:])
@@ -161,6 +172,7 @@ func (h *BlockHeaderHasher) Update(header *BlockHeader) *big.Int {
 
 		// time
 		if h.previousTime != header.Time {
+			bufferChanged = true
 			h.previousTime = header.Time
 
 			// write out the new value
@@ -194,7 +206,8 @@ func (h *BlockHeaderHasher) Update(header *BlockHeader) *big.Int {
 		}
 
 		// nonce
-		if offset != 0 || h.previousNonce != header.Nonce {
+		if offset != 0 || (!CUDA_ENABLED && h.previousNonce != header.Nonce) {
+			bufferChanged = true
 			h.previousNonce = header.Nonce
 
 			// write out the new value (or old value at a new location)
@@ -221,6 +234,7 @@ func (h *BlockHeaderHasher) Update(header *BlockHeader) *big.Int {
 
 		// transaction_count
 		if offset != 0 || h.previousTransactionCount != header.TransactionCount {
+			bufferChanged = true
 			h.previousTransactionCount = header.TransactionCount
 
 			// write out the new value (or old value at a new location)
@@ -241,6 +255,35 @@ func (h *BlockHeaderHasher) Update(header *BlockHeader) *big.Int {
 
 		// it's possible (likely) we did a bunch of encoding with no net impact to the buffer length
 		h.bufLen += offset
+
+		if CUDA_ENABLED && bufferChanged {
+			// something besides the nonce changed since last time. update the buffers in CUDA.
+			lastOffset := h.nonceOffset + h.nonceLen
+			h.hashesPerAttempt = CudaMinerUpdate(minerNum, h.buffer, h.bufLen,
+				h.nonceOffset, lastOffset, header.Target)
+		}
+	}
+
+	if CUDA_ENABLED {
+		var nonce int64 = CudaMinerMine(minerNum, header.Nonce)
+		if nonce == 0x7FFFFFFFFFFFFFFF {
+			h.result.SetBytes(
+				// indirectly let miner.go know we failed
+				[]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+					0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+					0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+					0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+			)
+			// -1 here for the +1 done in miner.go
+			header.Nonce += h.hashesPerAttempt - 1
+			return h.result, h.hashesPerAttempt
+		} else {
+			log.Printf("CUDA miner %d found a possible solution: %d, double-checking it...\n", minerNum, nonce)
+			// rebuild the buffer with the new nonce since we don't update it
+			// per attempt when using CUDA.
+			header.Nonce = nonce
+			h.initBuffer(header)
+		}
 	}
 
 	// hash it
@@ -248,5 +291,5 @@ func (h *BlockHeaderHasher) Update(header *BlockHeader) *big.Int {
 	h.hasher.Write(h.buffer[:h.bufLen])
 	h.hasher.Read(h.resultBuf[:])
 	h.result.SetBytes(h.resultBuf[:])
-	return h.result
+	return h.result, h.hashesPerAttempt
 }
diff --git a/client/main.go b/client/main.go
index 2296d9c..6ba41fa 100644
--- a/client/main.go
+++ b/client/main.go
@@ -68,6 +68,15 @@ func main() {
 		}
 	}
 
+	// initialize CUDA if enabled
+	if CUDA_ENABLED && *numMinersPtr > 0 {
+		deviceCount := CudaInit()
+		if deviceCount != *numMinersPtr {
+			log.Fatalf("CUDA enabled but -numminers is %d and supported devices is %d\n",
+				*numMinersPtr, deviceCount)
+		}
+	}
+
 	// load genesis block
 	genesisBlock := new(Block)
 	if err := json.Unmarshal([]byte(GenesisBlockJson), genesisBlock); err != nil {
diff --git a/cuda.go b/cuda.go
new file mode 100644
index 0000000..faa540e
--- /dev/null
+++ b/cuda.go
@@ -0,0 +1,41 @@
+// +build cuda
+// Copyright 2019 cruzbit developers
+// Use of this source code is governed by a MIT-style license that can be found in the LICENSE file.
+
+package cruzbit
+
+//#cgo LDFLAGS: -L./cuda/build -lcuda
+//
+// #include <stdint.h>
+//
+// extern int cuda_init();
+// extern int miner_update(int miner_num, const void *first, size_t first_len, const void *last,
+//     size_t last_len, const void *target);
+// extern int64_t miner_mine(int miner_num, int64_t start_nonce);
+//
+import "C"
+
+import (
+	"unsafe"
+)
+
+const CUDA_ENABLED = true
+
+// CudaInit is called on startup.
+func CudaInit() int {
+	return int(C.cuda_init())
+}
+
+// CudaMinerUpdate is called by a miner goroutine when the underlying header changes.
+func CudaMinerUpdate(minerNum int, headerBytes []byte, headerBytesLen, startNonceOffset, endNonceOffset int, target BlockID) int64 {
+	return int64(C.miner_update(C.int(minerNum), unsafe.Pointer(&headerBytes[0]), C.size_t(startNonceOffset),
+		unsafe.Pointer(&headerBytes[endNonceOffset]), C.size_t(headerBytesLen-endNonceOffset),
+		unsafe.Pointer(&target[0])))
+}
+
+// CudaMine is called on every solution attempt by a miner goroutine.
+// It will perform N hashing attempts where N is the maximum number of threads your device is capable of executing.
+// Returns a solving nonce; otherwise 0x7FFFFFFFFFFFFFFF.
+func CudaMinerMine(minerNum int, startNonce int64) int64 {
+	return int64(C.miner_mine(C.int(minerNum), C.int64_t(startNonce)))
+}
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
new file mode 100644
index 0000000..8cff934
--- /dev/null
+++ b/cuda/CMakeLists.txt
@@ -0,0 +1,9 @@
+cmake_minimum_required(VERSION 3.12)
+project(cuda)
+find_package(CUDA REQUIRED)
+set(CUDA_SEPARABLE_COMPILATION ON)
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -Xcompiler -fPIC)
+CUDA_ADD_LIBRARY(cuda SHARED
+  mine.cu
+  sha3.cu
+  )
diff --git a/cuda/LICENSE b/cuda/LICENSE
new file mode 100644
index 0000000..d2d484d
--- /dev/null
+++ b/cuda/LICENSE
@@ -0,0 +1,22 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Markku-Juhani O. Saarinen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/cuda/README.md b/cuda/README.md
new file mode 100644
index 0000000..140ee42
--- /dev/null
+++ b/cuda/README.md
@@ -0,0 +1,63 @@
+# This is a CUDA port of tiny_sha3 which also includes mining specific calls for cruzbit. Original project's README follows. -asdvxgxasjab 19-Jul-19
+
+# tiny_sha3
+Very small, readable implementation of the FIPS 202 and SHA3 hash function.
+Public domain.
+
+### Updated 27-Dec-15:
+
+Added SHAKE128 and SHAKE256 code and test vectors. The code can actually do
+a XOF of arbitrary size (like "SHAKE512").
+
+
+### Updated 03-Sep-15:
+
+Made the implementation portable. The API is now pretty much the
+same that OpenSSL uses. 
+
+
+### Updated 07-Aug-15:
+
+Now that SHA3 spec is out, I've updated the package to match with the
+new padding rules. There is literally one line difference between 
+Keccak 3.0 and SHA-3 implementations:
+
+```
+    temp[inlen++] = 0x06;           // XXX Padding Changed from Keccak 3.0
+```
+
+The 0x06 constant there used to be 0x01. But this of course totally 
+breaks compatibility and test vectors had to be revised.
+
+SHA-3 Spec: http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+
+Cheers,
+- markku
+
+
+### Original README.TXT from 19-Nov-11:
+
+Hi.
+
+The SHA-3 competition is nearing it's end and I would personally like
+to support Keccak as the winner. I have a PhD in hash function cryptanalysis
+so don't take my word for it, go ahead and look into the code !
+
+Since I couldn't find a *compact* and/or *readable* implementation of Keccak
+anywhere, here's one I cooked up as a service to the curious.
+
+This implementation is intended for study of the algorithm, not for
+production use.
+
+The code works correctly on 64-bit little-endian platforms with gcc.
+Like your Linux box. The main.c module contains self-tests for all
+officially supported hash sizes.
+
+If you're looking for production code, the official multi-megabyte package
+covers everyting you could possibly need and too much much more:
+http://keccak.noekeon.org/
+
+Cheers,
+- Markku  19-Nov-11
+
+Dr. Markku-Juhani O. Saarinen <mjos@iki.fi>
diff --git a/cuda/mine.cu b/cuda/mine.cu
new file mode 100644
index 0000000..0656624
--- /dev/null
+++ b/cuda/mine.cu
@@ -0,0 +1,255 @@
+#include "sha3.h"
+#include <cuda.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine the #
+  // of cores per SM
+  typedef struct {
+    int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
+            // minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+      {0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+      {0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+      {0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+      {0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+      {0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
+      {0x53, 128}, // Maxwell Generation (SM 5.3) GM20x class
+      {0x60, 64},  // Pascal Generation (SM 6.0) GP100 class
+      {0x61, 128}, // Pascal Generation (SM 6.1) GP10x class
+      {0x62, 128}, // Pascal Generation (SM 6.2) GP10x class
+      {0x70, 64},  // Volta Generation (SM 7.0) GV100 class
+      {-1, -1}};
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one to run
+  // properly
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+__device__ int memcmp_cu(const void *p1, const void *p2, size_t len) {
+  for (size_t i = 0; i < len; i++) {
+    uint8_t b1 = ((uint8_t *)p1)[i];
+    uint8_t b2 = ((uint8_t *)p2)[i];
+    if (b1 < b2) {
+      return -1;
+    }
+    if (b1 > b2) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+__device__ int strlen_cu(char *s) {
+  int i;
+  for (i = 0; s[i] != '\0';) {
+    i++;
+  }
+  return i;
+}
+
+__device__ char *reverse(char *str) {
+  char tmp, *src, *dst;
+  size_t len;
+  if (str != NULL) {
+    len = strlen_cu(str);
+    if (len > 1) {
+      src = str;
+      dst = src + len - 1;
+      while (src < dst) {
+        tmp = *src;
+        *src++ = *dst;
+        *dst-- = tmp;
+      }
+    }
+  }
+  return str;
+}
+
+__device__ int itoa(int64_t n, char s[]) {
+  int i;
+  int64_t sign;
+
+  if ((sign = n) < 0) /* record sign */
+    n = -n;           /* make n positive */
+  i = 0;
+
+  do {                     /* generate digits in reverse order */
+    s[i++] = n % 10 + '0'; /* get next digit */
+  } while ((n /= 10) > 0); /* delete it */
+
+  if (sign < 0)
+    s[i++] = '-';
+
+  s[i] = '\0';
+  reverse(s);
+  return i;
+}
+
+__device__ void debug_print_buf(const void *buf, size_t len) {
+  for (int i = 0; i < len; i++) {
+    printf("%c", ((char *)buf)[i]);
+  }
+  printf("\n");
+}
+
+__device__ void debug_print_hash(const void *hash) {
+  for (int i = 0; i < 32; i++) {
+    printf("%02x", ((char *)hash)[i] & 0xFF);
+  }
+  printf("\n");
+}
+
+// called from the gpu kernel
+__global__ void do_sha3(const void *first, size_t first_len, const void *last,
+                        size_t last_len, int64_t start_nonce, void *target,
+                        int64_t *good_nonce, int *hashes) {
+  uint8_t hash[32];
+  uint8_t nonce_s[20];
+
+  int index = blockDim.x * blockIdx.x + threadIdx.x;
+  int64_t nonce = start_nonce + (int64_t)index;
+  size_t n = (size_t)itoa(nonce, (char *)nonce_s);
+
+  sha3_ctx_t sha3;
+
+  sha3_init_cu(&sha3, 32);
+  sha3_update_cu(&sha3, first, first_len);
+  sha3_update_cu(&sha3, nonce_s, n);
+  sha3_update_cu(&sha3, last, last_len);
+  sha3_final_cu(hash, &sha3);
+
+  // atomicAdd(hashes, 1);
+#if 0
+  if (index == 0) {
+    debug_print_buf(first, first_len);
+    debug_print_buf(nonce_s, n);
+    debug_print_buf(last, last_len);
+    debug_print_hash(hash);
+    debug_print_hash(target);
+  }
+#endif
+
+  if (memcmp_cu(hash, target, 32) <= 0) {
+#if 0
+    debug_print_buf(first, first_len);
+    debug_print_buf(nonce_s, n);
+    debug_print_buf(last, last_len);
+    debug_print_hash(target);
+    debug_print_hash((char *)hash);
+#endif
+    // found a solution. not thread-safe but a race is very unlikely
+    *good_nonce = nonce;
+  }
+}
+
+struct miner_state {
+  void *first_cu, *last_cu, *target_cu;
+  size_t first_len, last_len;
+  int num_blocks, block_size, max_threads;
+  int64_t *nonce_cu;
+  int *hashes_cu;
+};
+
+static struct miner_state *states = 0;
+
+extern "C" {
+
+// called on startup
+int cuda_init() {
+  int device_count = -1;
+  cudaGetDeviceCount(&device_count);
+  if (device_count <= 0) {
+    return -1;
+  }
+
+  states = new struct miner_state[device_count];
+
+  for (int i = 0; i < device_count; i++) {
+    cudaDeviceProp props;
+    cudaGetDeviceProperties(&props, i);
+    int cores = props.major == 9999 && props.minor == 9999
+                    ? 1
+                    : _ConvertSMVer2Cores(props.major, props.minor);
+    cores *= props.multiProcessorCount;
+
+    states[i].max_threads =
+        props.maxThreadsPerMultiProcessor * props.multiProcessorCount;
+    states[i].block_size = props.warpSize;
+    states[i].num_blocks = states[i].max_threads / states[i].block_size;
+
+    // allocate memory used on device
+    cudaMalloc(&states[i].first_cu, 512);
+    cudaMalloc(&states[i].last_cu, 512);
+    cudaMalloc(&states[i].target_cu, 32);
+    cudaMalloc(&states[i].nonce_cu, sizeof(int64_t));
+    cudaMalloc(&states[i].hashes_cu, sizeof(int));
+
+    cudaMemset(states[i].hashes_cu, 0, sizeof(int));
+    cudaMemset(states[i].nonce_cu, 0x7F, sizeof(int64_t));
+    cudaMemset(states[i].nonce_cu, 0xFF, sizeof(int64_t) - 1);
+  }
+
+  return device_count;
+}
+
+// called after updating the block header
+int miner_update(int miner_num, const void *first, size_t first_len,
+                 const void *last, size_t last_len, const void *target) {
+  cudaSetDevice(miner_num);
+
+  // copy the first part of the header
+  states[miner_num].first_len = first_len;
+  cudaMemcpy(states[miner_num].first_cu, first, first_len,
+             cudaMemcpyHostToDevice);
+
+  // copy the end part of the header
+  states[miner_num].last_len = last_len;
+  cudaMemcpy(states[miner_num].last_cu, last, last_len, cudaMemcpyHostToDevice);
+
+  // copy the target
+  cudaMemcpy(states[miner_num].target_cu, target, 32, cudaMemcpyHostToDevice);
+
+  // clear nonce
+  cudaMemset(states[miner_num].nonce_cu, 0x7F, sizeof(int64_t));
+  cudaMemset(states[miner_num].nonce_cu, 0xFF, sizeof(int64_t) - 1);
+
+  return states[miner_num].num_blocks * states[miner_num].block_size;
+}
+
+// called in a loop until solved
+// returns a solving nonce if found; otherwise 0x7FFFFFFFFFFFFFFF
+int64_t miner_mine(int miner_num, int64_t start_nonce) {
+  cudaSetDevice(miner_num);
+  int64_t nonce;
+  cudaMemset(states[miner_num].hashes_cu, 0, sizeof(int));
+  int num_blocks = states[miner_num].num_blocks;
+  int block_size = states[miner_num].block_size;
+  do_sha3<<<num_blocks, block_size>>>(
+      states[miner_num].first_cu, states[miner_num].first_len,
+      states[miner_num].last_cu, states[miner_num].last_len, start_nonce,
+      states[miner_num].target_cu, states[miner_num].nonce_cu,
+      states[miner_num].hashes_cu);
+  cudaDeviceSynchronize();
+  cudaMemcpy(&nonce, states[miner_num].nonce_cu, sizeof(int64_t),
+             cudaMemcpyDeviceToHost);
+  return nonce;
+}
+}
diff --git a/cuda/sha3.cu b/cuda/sha3.cu
new file mode 100644
index 0000000..9c16325
--- /dev/null
+++ b/cuda/sha3.cu
@@ -0,0 +1,158 @@
+// sha3.c
+// 19-Nov-11  Markku-Juhani O. Saarinen <mjos@iki.fi>
+
+// Revised 07-Aug-15 to match with official release of FIPS PUB 202 "SHA3"
+// Revised 03-Sep-15 for portability + OpenSSL - style API
+
+#include "sha3.h"
+
+// update the state with given number of rounds
+
+__device__ void sha3_keccakf_cu(uint64_t st[25]) {
+  // constants
+  const uint64_t keccakf_rndc[24] = {
+      0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
+      0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
+      0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
+      0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
+      0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
+      0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
+      0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
+      0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
+  const int keccakf_rotc[24] = {1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
+                                27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44};
+  const int keccakf_piln[24] = {10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
+                                15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1};
+
+  // variables
+  int i, j, r;
+  uint64_t t, bc[5];
+
+#if 0
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+  uint8_t *v;
+
+  // endianess conversion. this is redundant on little-endian targets
+  for (i = 0; i < 25; i++) {
+    v = (uint8_t *)&st[i];
+    st[i] = ((uint64_t)v[0]) | (((uint64_t)v[1]) << 8) |
+            (((uint64_t)v[2]) << 16) | (((uint64_t)v[3]) << 24) |
+            (((uint64_t)v[4]) << 32) | (((uint64_t)v[5]) << 40) |
+            (((uint64_t)v[6]) << 48) | (((uint64_t)v[7]) << 56);
+  }
+#endif
+#endif
+
+  // actual iteration
+  for (r = 0; r < KECCAKF_ROUNDS; r++) {
+
+    // Theta
+    for (i = 0; i < 5; i++)
+      bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
+
+    for (i = 0; i < 5; i++) {
+      t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
+      for (j = 0; j < 25; j += 5)
+        st[j + i] ^= t;
+    }
+
+    // Rho Pi
+    t = st[1];
+    for (i = 0; i < 24; i++) {
+      j = keccakf_piln[i];
+      bc[0] = st[j];
+      st[j] = ROTL64(t, keccakf_rotc[i]);
+      t = bc[0];
+    }
+
+    //  Chi
+    for (j = 0; j < 25; j += 5) {
+      for (i = 0; i < 5; i++)
+        bc[i] = st[j + i];
+      for (i = 0; i < 5; i++)
+        st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
+    }
+
+    //  Iota
+    st[0] ^= keccakf_rndc[r];
+  }
+
+#if 0
+#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
+  // endianess conversion. this is redundant on little-endian targets
+  for (i = 0; i < 25; i++) {
+    v = (uint8_t *)&st[i];
+    t = st[i];
+    v[0] = t & 0xFF;
+    v[1] = (t >> 8) & 0xFF;
+    v[2] = (t >> 16) & 0xFF;
+    v[3] = (t >> 24) & 0xFF;
+    v[4] = (t >> 32) & 0xFF;
+    v[5] = (t >> 40) & 0xFF;
+    v[6] = (t >> 48) & 0xFF;
+    v[7] = (t >> 56) & 0xFF;
+  }
+#endif
+#endif
+}
+
+// Initialize the context for SHA3
+
+__device__ int sha3_init_cu(sha3_ctx_t *c, int mdlen) {
+  int i;
+
+  for (i = 0; i < 25; i++)
+    c->st.q[i] = 0;
+  c->mdlen = mdlen;
+  c->rsiz = 200 - 2 * mdlen;
+  c->pt = 0;
+
+  return 1;
+}
+
+// update state with more data
+
+__device__ int sha3_update_cu(sha3_ctx_t *c, const void *data, size_t len) {
+  size_t i;
+  int j;
+
+  j = c->pt;
+  for (i = 0; i < len; i++) {
+    c->st.b[j++] ^= ((const uint8_t *)data)[i];
+    if (j >= c->rsiz) {
+      sha3_keccakf_cu(c->st.q);
+      j = 0;
+    }
+  }
+  c->pt = j;
+
+  return 1;
+}
+
+// finalize and output a hash
+
+__device__ int sha3_final_cu(void *md, sha3_ctx_t *c) {
+  int i;
+
+  c->st.b[c->pt] ^= 0x06;
+  c->st.b[c->rsiz - 1] ^= 0x80;
+  sha3_keccakf_cu(c->st.q);
+
+  for (i = 0; i < c->mdlen; i++) {
+    ((uint8_t *)md)[i] = c->st.b[i];
+  }
+
+  return 1;
+}
+
+// compute a SHA-3 hash (md) of given byte length from "in"
+
+__device__ void *sha3_cu(const void *in, size_t inlen, void *md, int mdlen) {
+  sha3_ctx_t sha3;
+
+  sha3_init_cu(&sha3, mdlen);
+  sha3_update_cu(&sha3, in, inlen);
+  sha3_final_cu(md, &sha3);
+
+  return md;
+}
diff --git a/cuda/sha3.h b/cuda/sha3.h
new file mode 100644
index 0000000..6d1080b
--- /dev/null
+++ b/cuda/sha3.h
@@ -0,0 +1,39 @@
+// sha3.h
+// 19-Nov-11  Markku-Juhani O. Saarinen <mjos@iki.fi>
+
+#ifndef SHA3_CU_H
+#define SHA3_CU_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifndef KECCAKF_ROUNDS
+#define KECCAKF_ROUNDS 24
+#endif
+
+#ifndef ROTL64
+#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
+#endif
+
+// state context
+typedef struct {
+  union {           // state:
+    uint8_t b[200]; // 8-bit bytes
+    uint64_t q[25]; // 64-bit words
+  } st;
+  int pt, rsiz, mdlen; // these don't overflow
+} sha3_ctx_t;
+
+// Compression function.
+__device__ void sha3_keccakf(uint64_t st[25]);
+
+// OpenSSL - like interfece
+__device__ int sha3_init_cu(sha3_ctx_t *c,
+                            int mdlen); // mdlen = hash output in bytes
+__device__ int sha3_update_cu(sha3_ctx_t *c, const void *data, size_t len);
+__device__ int sha3_final_cu(void *md, sha3_ctx_t *c); // digest goes to md
+
+// compute a sha3 hash (md) of given byte length from "in"
+__device__ void *sha3_cu(const void *in, size_t inlen, void *md, int mdlen);
+
+#endif
diff --git a/cuda_stubs.go b/cuda_stubs.go
new file mode 100644
index 0000000..5bacc66
--- /dev/null
+++ b/cuda_stubs.go
@@ -0,0 +1,19 @@
+// +build !cuda
+// Copyright 2019 cruzbit developers
+// Use of this source code is governed by a MIT-style license that can be found in the LICENSE file.
+
+package cruzbit
+
+const CUDA_ENABLED = false
+
+func CudaInit() int {
+	return 0
+}
+
+func CudaMinerUpdate(minerNum int, headerBytes []byte, headerBytesLen, startNonceOffset, endNonceOffset int, target BlockID) int64 {
+	return 0
+}
+
+func CudaMinerMine(minerNum int, startNonce int64) int64 {
+	return 0
+}
diff --git a/miner.go b/miner.go
index 10be758..f274dec 100644
--- a/miner.go
+++ b/miner.go
@@ -212,8 +212,8 @@ func (m *Miner) run() {
 			}
 
 			// hash the block and check the proof-of-work
-			hashes++
-			idInt := block.Header.IDFast()
+			idInt, attempts := block.Header.IDFast(m.num)
+			hashes += attempts
 			if idInt.Cmp(targetInt) <= 0 {
 				// found a solution
 				id := new(BlockID).SetBigInt(idInt)
@@ -291,7 +291,7 @@ func (h *HashrateMonitor) run() {
 	defer h.wg.Done()
 
 	var totalHashes int64
-	updateInterval := 5 * time.Minute
+	updateInterval := 1 * time.Minute
 	ticker := time.NewTicker(updateInterval)
 	defer ticker.Stop()