In [36]:
import numba

# Allocate a zero-length Numba CUDA device array and print its Python type.
# NOTE(review): `numba.cuda` is reached through a bare `import numba` here —
# confirm this still resolves on a freshly restarted kernel.
a = numba.cuda.device_array(shape=0)
print(type(a))

<class 'numba.cuda.cudadrv.devicearray.DeviceNDArray'>


In [3]:
import numpy as np
import numba.cuda.random as curand
from numba import cuda

@cuda.jit
def init_random_gpu_jit(N, rng_states, out):
    """CUDA kernel: fill every row of ``out`` with ``N`` uniform float32 draws.

    Row ``r`` of ``out`` is produced from the generator state ``rng_states[r]``.
    Rows are spread over the launched threads with a grid-stride loop, so any
    1-D launch configuration processes all ``rng_states.shape[0]`` rows.

    Args:
        N: Number of values written per row (columns ``0 .. N-1`` of ``out``).
        rng_states: Device array of xoroshiro128+ states, one per output row.
        out: 2-D device array receiving the generated values.
    """
    num_rows = rng_states.shape[0]
    first_row = cuda.grid(ndim=1) # type: ignore - Suppress Pylance argument typingctx error
    row_step = cuda.gridsize(ndim=1) # type: ignore - Suppress Pylance argument typingctx error
    for row in range(first_row, num_rows, row_step):
        for col in range(N):
            # Each draw advances the state that belongs to this row only.
            out[row, col] = curand.xoroshiro128p_uniform_float32(rng_states, row)

def init_random(
    N: int,
    seed0: int,
    seed1: int,
    threads_per_block: int = 32,
) -> "cuda.cudadrv.devicearray.DeviceNDArray":
    """Generate ``N`` uniform float32 values on the GPU for every seed in a range.

    One output row is produced per integer seed in the half-open range
    ``[seed0, seed1)``. Each row uses its own xoroshiro128+ state initialised
    from that row's seed.

    Args:
        N: Number of random values per seed (columns of the result).
        seed0: First seed (inclusive).
        seed1: Stop value of the seed range (exclusive).
        threads_per_block: Threads per CUDA block used for the launch. The
            kernel iterates with a grid-stride loop, so this only affects the
            launch configuration, not which rows are filled.

    Returns:
        A CUDA device array of shape ``(seed1 - seed0, N)`` and dtype float32
        (zero rows when the range is empty). Call ``copy_to_host()`` on it to
        obtain a NumPy array.

    Raises:
        ValueError: If ``threads_per_block`` is not positive.
    """
    if threads_per_block < 1:
        raise ValueError("threads_per_block must be a positive integer")

    seeds = np.arange(start=seed0, stop=seed1, dtype=np.int32)
    num_seeds = seeds.size
    data = cuda.device_array((num_seeds, N), dtype=np.float32)

    # An empty seed range would yield a zero-block grid, which is not a valid
    # CUDA launch configuration; there is nothing to fill, so return early.
    if num_seeds == 0:
        return data

    # Allocate one RNG state per seed, then overwrite each state individually
    # so that row i is driven by seeds[i] rather than by a shared base seed.
    rng_states = curand.create_xoroshiro128p_states(num_seeds, seed=0)
    for i in range(num_seeds):
        curand.init_xoroshiro128p_states(rng_states[i:i+1], seeds[i])

    # Launch the CUDA kernel with enough blocks to give every seed a thread.
    blocks_grid = (num_seeds + threads_per_block - 1) // threads_per_block
    init_random_gpu_jit[blocks_grid, threads_per_block](N, rng_states, data) # type: ignore - Suppress Pylance Object of type "(N: Unknown, rng_states: Unknown, out: Unknown) -> None" is not subscriptable

    return data

# Usage example: one row of `num_values` random floats for every integer seed
# in the half-open range [num_seeds_start, num_seeds_end).
num_values = 10000  # Number of random values per seed
num_seeds_start = 0  # First seed (inclusive)
num_seeds_end = 1000  # Stop value of the seed range (exclusive)

# Generate the values; `init_random` launches a CUDA kernel and returns the
# array that kernel wrote to.
data = init_random(num_values, num_seeds_start, num_seeds_end)

# Copy the result back to host memory and print it.
print(data.copy_to_host()) 


[[0.7666216  0.8435221  0.67347515 ... 0.30960736 0.89397955 0.3713509 ]
 [0.13312314 0.3780597  0.2305517  ... 0.31311917 0.8713214  0.7731694 ]
 [0.18237947 0.4034984  0.29253274 ... 0.6022286  0.06064058 0.11549199]
 ...
 [0.09955441 0.33017534 0.56993085 ... 0.70994323 0.70531285 0.5992914 ]
 [0.6272834  0.7467063  0.7721667  ... 0.37673646 0.35704595 0.9751529 ]
 [0.8169655  0.8054853  0.36733994 ... 0.04749916 0.856052   0.72720164]]


In [None]:
import numpy as np
import numba.cuda.random as curand
from numba import njit, cuda


class HigherArray:
    """2-D float32 array of seeded uniform random numbers, on host or CUDA device.

    One row is generated per seed. ``is_host`` selects where ``data`` lives:
    a NumPy array when true, a Numba CUDA device array when false. ``shape``
    mirrors ``data.shape`` once one of the ``init_random*`` methods has run.
    """

    def __init__(self, dtype: type = np.float32, is_host: bool = True) -> None:
        """Create an empty container; call ``init_random`` to allocate and fill it.

        Args:
            dtype: Element type recorded on the instance. The ``init_random*``
                methods always produce ``np.float32`` data regardless of it.
            is_host: True to keep ``data`` in host memory, False to keep it on
                the CUDA device.
        """
        self.shape: tuple[int] | tuple[int, int] = (0,)
        self.dtype = dtype
        self.is_host = is_host
        self.data = None

    @staticmethod
    @cuda.jit
    def init_random_gpu_jit(N, rng_states, out):
        """CUDA kernel: write ``N`` uniform float32 draws into each row of ``out``.

        Row ``r`` is generated from ``rng_states[r]``; rows are distributed over
        threads with a grid-stride loop.
        """
        ind_start = cuda.grid(ndim=1) # type: ignore - Suppress Pylance argument typingctx error
        ind_stride = cuda.gridsize(ndim=1) # type: ignore - Suppress Pylance argument typingctx error
        for ind in range(ind_start, rng_states.shape[0], ind_stride):
            for i in range(N):
                out[ind, i] = curand.xoroshiro128p_uniform_float32(rng_states, ind)

    @staticmethod
    @njit
    def _fill_random_cpu_jit(seeds, out):
        """Compiled helper: fill row ``r`` of ``out`` from a stream seeded with ``seeds[r]``."""
        for row in range(seeds.shape[0]):
            # Inside jitted code this seeds Numba's own generator state, which
            # is independent of NumPy's global generator.
            np.random.seed(seeds[row])
            for col in range(out.shape[1]):
                # Draws are float64 and are narrowed to out's dtype on store.
                out[row, col] = np.random.random()

    def init_random_cpu_jit(self, N: int, seeds: np.ndarray) -> None:
        """Fill ``data`` on the host using a Numba-compiled loop.

        The method itself is plain Python: nopython mode cannot type ``self``
        nor construct ``np.random.default_rng`` generators, so only the inner
        fill loop is compiled. Values are reproducible per seed but come from
        Numba's generator, so they differ from ``init_random_py``.

        Args:
            N: Number of random values per seed.
            seeds: 1-D integer array; one output row is produced per entry.
        """
        self.data = np.empty(shape=(seeds.size, N), dtype=np.float32)
        HigherArray._fill_random_cpu_jit(seeds, self.data)
        self.shape = self.data.shape

    def init_random_py(self, N: int, seeds: np.ndarray) -> None:
        """Fill ``data`` on the host using one NumPy ``default_rng`` per seed.

        Args:
            N: Number of random values per seed.
            seeds: 1-D integer array; one output row is produced per entry.
        """
        self.data = np.empty(shape=(seeds.size, N), dtype=np.float32)
        for ind_seed, seed in enumerate(seeds):
            self.data[ind_seed] = np.random.default_rng(seed).random(size=N, dtype=np.float32)
        self.shape = self.data.shape

    def init_random(self, N: int, seed0: np.int32, seed1: np.int32) -> None:
        """Allocate ``data`` and fill it with ``N`` random values per seed.

        Seeds are the integers in the half-open range ``[seed0, seed1)``. The
        host path is used when ``is_host`` is true, the CUDA path otherwise.

        Args:
            N: Number of random values per seed (columns of ``data``).
            seed0: First seed (inclusive).
            seed1: Stop value of the seed range (exclusive).
        """
        seeds = np.arange(start=seed0, stop=seed1, dtype=np.int32)
        if self.is_host:
            self.init_random_cpu_jit(N=N, seeds=seeds)
            return

        num_seeds = seeds.size
        self.data = cuda.device_array(shape=(num_seeds, N), dtype=np.float32)
        self.shape = self.data.shape

        # An empty seed range would give a zero-block grid, which is not a
        # valid CUDA launch configuration; nothing needs filling in that case.
        if num_seeds == 0:
            return

        # Allocate one RNG state per seed, then overwrite each state so that
        # row i is driven by seeds[i] rather than by a shared base seed.
        rng_states = curand.create_xoroshiro128p_states(num_seeds, seed=0)
        for i in range(num_seeds):
            curand.init_xoroshiro128p_states(rng_states[i:i+1], seeds[i])

        # Launch the CUDA kernel
        per_block_threads_grid = 32 # Multiple of 32 below 513
        blocks_grid = (num_seeds + per_block_threads_grid - 1) // per_block_threads_grid
        HigherArray.init_random_gpu_jit[blocks_grid, per_block_threads_grid](N, rng_states, self.data) # type: ignore - Suppress Pylance Object of type "(N: Unknown, rng_states: Unknown, out: Unknown) -> None" is not subscriptable
