In [1]:
#References
#https://shreeraman-ak.medium.com/parallel-reduction-with-cuda-d0ae10c1ae2c
#https://www.youtube.com/watch?v=bpbit8SPMxU


import math
from numba import njit, prange, cuda
import numpy as np
import random
import time
from numba.cuda.random import (create_xoroshiro128p_states,xoroshiro128p_uniform_float32)


import numpy as np
from numba import cuda, float32
import time

# CUDA kernel: fill a 1D array with uniform random float32 samples
@cuda.jit
def random_1d(arr, rng_states):
    """Store one uniform random float32 in every element of ``arr``.

    Each thread uses its global 1D index both as the output position and as
    the index of its generator state, so ``rng_states`` must hold at least
    ``arr.shape[0]`` states.
    """
    pos = cuda.grid(1)
    # Threads beyond the end of the array (partial last block) have no work.
    if pos >= arr.shape[0]:
        return
    arr[pos] = xoroshiro128p_uniform_float32(rng_states, pos)


def sum_brute_force(arr):
    """Return the sum of the items of ``arr`` using a plain Python loop.

    The accumulator starts at the integer 0, so an empty iterable yields 0.
    This is deliberately the naive element-by-element baseline.
    """
    total = 0
    for value in arr:
        total += value
    return total


# CUDA kernel: reduce each block's slice of a 1D array to one partial sum
@cuda.jit
def sum_reduction_1d_kernel(arr, result):
    """Write the sum of each block's elements of ``arr`` into ``result``.

    Every thread loads one element of ``arr`` (or 0 when its global index is
    past the end) into shared memory; the block then combines those values
    with a tree reduction and thread 0 stores the block total in
    ``result[blockIdx.x]``.

    Launch requirements:
      * 1D grid and 1D blocks.
      * The block size must be a power of two and at most 256, because the
        shared buffer has 256 float32 slots and the stride is halved on every
        step (an odd intermediate stride would skip elements).
      * ``result`` needs one entry per block.
    """
    # Per-block scratch buffer, one slot per thread. The shape has to be a
    # constant, which is what caps the block size at 256.
    shared_arr = cuda.shared.array(shape=(256,), dtype=float32)

    idx = cuda.grid(1)
    tid = cuda.threadIdx.x
    bid = cuda.blockIdx.x
    bdim = cuda.blockDim.x

    # Stage this thread's element; pad with 0 so the tail of the last block
    # does not affect the sum.
    if idx < arr.shape[0]:
        shared_arr[tid] = arr[idx]
    else:
        shared_arr[tid] = 0
    cuda.syncthreads()

    # Tree reduction: on each step the lower `stride` threads fold in the
    # value `stride` slots above them, then the active range is halved.
    # (The previous loop never changed the stride, so it re-added the upper
    # half bdim // 2 times instead of summing the block.)
    stride = bdim // 2
    while stride > 0:
        if tid < stride:
            shared_arr[tid] += shared_arr[tid + stride]
        # All threads must reach the barrier, including the idle ones.
        cuda.syncthreads()
        stride //= 2

    # Slot 0 now holds the block total.
    if tid == 0:
        result[bid] = shared_arr[0]

# Host wrapper: sum a 1D array with the block-wise CUDA reduction kernel
def sum_reduction_1d(arr):
    """Return the sum of the 1D array ``arr`` computed on the GPU.

    The array is copied to the device, ``sum_reduction_1d_kernel`` produces
    one float32 partial sum per block, and the partial sums are added on the
    host with ``np.sum``.

    Args:
        arr: 1D array exposing ``shape`` (e.g. a NumPy array).

    Returns:
        The total as a NumPy float32 scalar; 0 for an empty array.
    """
    n = arr.shape[0]
    if n == 0:
        # A grid with zero blocks is not a valid launch configuration.
        return np.float32(0.0)

    # Must match the size of the kernel's shared buffer (256 slots).
    block_dim = 256
    grid_dim = (n + block_dim - 1) // block_dim

    # Copy the input to the device and allocate the per-block output there.
    # No initialisation is needed: every block writes its own slot.
    arr_gpu = cuda.to_device(arr)
    partial_gpu = cuda.device_array(grid_dim, dtype=np.float32)

    sum_reduction_1d_kernel[grid_dim, block_dim](arr_gpu, partial_gpu)

    # The default-stream copy back to the host is synchronous, so it also
    # waits for the kernel to finish.
    partial_sums = partial_gpu.copy_to_host()

    return np.sum(partial_sums)

# Benchmark: CUDA block-wise sum reduction vs. a pure-Python loop

# Host buffer that the random-number kernel fills in
size = 100000000
arr = np.empty(size, dtype=np.float32)

# One generator state per element (each thread indexes its own state)
rng_states = create_xoroshiro128p_states(size, seed=1)

# One thread per element, rounded up to whole blocks
block_dim = 256
grid_dim = (size + block_dim - 1) // block_dim

# Generate random numbers on the GPU. `arr` is a host array; Numba transfers
# it to the device for the launch and copies the result back afterwards.
random_1d[grid_dim, block_dim](arr, rng_states)

# Warm-up: Numba compiles a kernel the first time it is called, so run the
# reduction once on a single block's worth of data to keep that one-time
# compilation cost out of the timed region.
sum_reduction_1d(arr[:block_dim])

# Time the sum reduction on the generated array (includes the host-to-device
# copy done inside sum_reduction_1d). perf_counter is the clock intended for
# measuring short intervals.
start = time.perf_counter()
result = sum_reduction_1d(arr)
end = time.perf_counter()

time_diff_cuda = end - start

print("Time taken for sum reduction with CUDA: {:.6f} seconds".format(time_diff_cuda))

# Fresh host-generated data for the CPU baseline
arr = np.random.rand(size).astype(np.float32)

# Time the element-by-element Python loop
start = time.perf_counter()
result_brute_force = sum_brute_force(arr)
end = time.perf_counter()

time_diff_brute_force = end - start
print("Time taken for sum calculation with brute-force method: {:.6f} seconds".format(time_diff_brute_force))


CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
: