Skip to content

Commit d665050

Browse files
committed
low and high-level versions of add in CUDA
1 parent b479bda commit d665050

File tree

2 files changed

+48
-0
lines changed

2 files changed

+48
-0
lines changed

cuda/cuda_add.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from numbapro import cuda, void, float32
import numpy as np


@cuda.jit('void(float32[:], float32[:], float32[:])')
def cu_add(a, b, c):
    """CUDA kernel: element-wise add, c[i] = a[i] + b[i].

    Each thread computes its own global index and handles exactly one
    element; threads whose index falls outside the output array exit
    without writing.
    """
    # Global 1-D thread index, spelled out by hand; equivalent to the
    # commented-out convenience call cuda.grid(1) in the original.
    # i = cuda.grid(1)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    i = tx + bx * bw

    # BUG FIX: the original guard was `if i > c.size`, which allows the
    # thread with i == c.size to fall through and write one element past
    # the end of the array. Valid indices are 0 .. c.size - 1.
    if i >= c.size:
        return
    c[i] = a[i] + b[i]
15+
16+
if __name__ == '__main__':
17+
gpu = cuda.get_current_device()
18+
19+
n = 100
20+
a = np.arange(n, dtype=np.float32)
21+
b = np.arange(n, dtype=np.float32)
22+
c = np.empty_like(a)
23+
24+
nthreads = gpu.WARP_SIZE
25+
nblocks = int(np.ceil(float(n)/nthreads))
26+
print 'Blocks per grid:', nblocks
27+
print 'Threads per block', nthreads
28+
29+
cu_add[nblocks, nthreads](a, b, c)
30+
print c

cuda/vect_add.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from numbapro import cuda, vectorize, guvectorize
from numbapro import void, int64, float32, float64
import numpy as np


@vectorize(['int64(int64, int64)',
            'float32(float32, float32)',
            'float64(float64, float64)'],
           target='gpu')
def cu_add(a, b):
    """GPU ufunc: element-wise add of two arrays (or scalars).

    Compiled for int64, float32 and float64 operands; NumPy
    broadcasting rules apply at the call site.
    """
    # FIX: the float32 specialization originally declared a float64
    # return type ('float64(float32, float32)'), silently widening
    # float32 inputs — inconsistent with the other signatures, which
    # all return the operand dtype.
    return a + b
11+
12+
if __name__ == '__main__':
13+
n = 100
14+
A = np.arange(n)
15+
B = np.arange(n)
16+
C = cu_add(A, B)
17+
print C
18+

0 commit comments

Comments
 (0)