In [1]:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray

# Threads per block along each of the x and y axes: kernels below are launched
# with block=(BLOCK_SIZE, BLOCK_SIZE, 1), and grid sizes are derived from it.
BLOCK_SIZE = 32

# CUDA blocks and grids are sized to cover all indices, so the launch may contain surplus threads that must not do any work. This function checks that the launched threads cover all the necessary indices.
def cuda_cover_test(n: int = 35, n1: int = 10, n2: int = 15) -> np.ndarray:
    """Run the ``cuda_cover_test`` kernel and return the array it wrote to.

    An ``n`` x ``n`` float32 array is zero-initialised on the device, the
    kernel from ``kernels.cu`` is launched on it, and the result is copied
    back so the caller can inspect which elements the launch touched.

    NOTE(review): the kernel source lives in ``kernels.cu`` and is not visible
    here; it presumably marks every element reached by a thread -- confirm.

    Args:
        n: Side length of the square array; also passed to the kernel as a
            32-bit integer.
        n1: Extent used to size the grid along y.
        n2: Extent used to size the grid along x.

    Returns:
        The ``(n, n)`` float32 array copied back from the device. Elements
        the kernel did not write are 0.

    Raises:
        ValueError: If ``n``, ``n1`` or ``n2`` is not positive.
    """
    if n < 1 or n1 < 1 or n2 < 1:
        raise ValueError("n, n1 and n2 must all be positive")

    ni = np.int32(n)

    # Zeros (rather than np.empty) so that untouched elements are well defined.
    x = np.zeros((n, n), dtype=np.float32)

    # Allocate device memory and clear it: mem_alloc does not initialise the
    # buffer, so without this copy untouched elements would hold garbage.
    x_gpu = cuda.mem_alloc(x.nbytes)
    cuda.memcpy_htod(x_gpu, x)

    # Compile the kernel; the context manager closes the source file promptly.
    with open("kernels.cu", "r") as source_file:
        mod = SourceModule(source_file.read())

    # Use a distinct local name so the enclosing function is not shadowed.
    kernel = mod.get_function("cuda_cover_test")

    # Ceiling division: the smallest number of blocks covering n2 (x) and
    # n1 (y). Grid dimensions must be integers.
    grid = (-(-n2 // BLOCK_SIZE), -(-n1 // BLOCK_SIZE), 1)

    # Launch the kernel and wait for it to finish.
    kernel(x_gpu, ni, block=(BLOCK_SIZE, BLOCK_SIZE, 1), grid=grid)
    cuda.Context.synchronize()

    # Copy the result back to the host.
    cuda.memcpy_dtoh(x, x_gpu)

    return x


# Run the coverage test once and keep the result for inspection.
x_test = cuda_cover_test()

print(x_test.shape)
# Full array, then its top-left 32x32 and 33x33 corners, one per line.
print(x_test, x_test[:32, :32], x_test[:33, :33], sep="\n")
# The indices covered by the CUDA kernel span the whole block (BLOCK_SIZE x BLOCK_SIZE), not the predefined n1 or n2!

(35, 35)
[[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]
[[1. 1. 1. ... 1. 1. 0.]
 [1. 1. 1. ... 1. 1. 0.]
 [1. 1. 1. ... 1. 1. 0.]
 ...
 [1. 1. 1. ... 1. 1. 0.]
 [1. 1. 1. ... 1. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
