In [1]:
from brownpy.gpu_sim import Universe
import brownpy.topology as Top
import matplotlib.pyplot as plt
import numpy as np
from tqdm.auto import tqdm
plt.style.use('dark_background')

In [2]:
dt = int(1E6) #fs (1ns) - time steps
D = 1.5E-4 # A²/fs  (1.5E-9 m²/s) - Diffusion coefficient

# Geometry
L = 1E4 # A (1um) - periodic size

N= 4*1024

In [3]:
# top = Top.Periodic(L=L)
top = Top.Infinite()
u = Universe(N=N, top=top, D=D, dt=dt,
             output_path='periodic')

periodic.hdf5 already exists, change output filename for periodic_20.hdf5


In [4]:
u.run(1_000); #warmup

100%|██████████| 1000/1000 [00:00<00:00, 1284.14it/s, total=1 µs]

With 4096 particles
------------------------------------------
GPU time per step and per particles:
Allocation: 500 fs
Engine: 522 ps
Transfert to RAM: 500 fs
Total: 523 ps
------------------------------------------
CPU time per step and per particles:
Total: 191 ns
------------------------------------------
For a timestep of 1 ns
To simulate the trajectory of 1 particle during 1 s, we need 191  s





In [22]:
value = list(u.engine.inspect_llvm().keys())[0]
print(u.engine.inspect_llvm()[value])

source_filename = "<string>"
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

@"_ZN6cudapy7brownpy7gpu_sim8Universe8_compile12$3clocals$3e10engine$247E5ArrayIfLi2E1C7mutable7alignedExxf5ArrayIjLi2E1C7mutable7alignedE5ArrayI6RecordILi854EELi1E1C7mutable7alignedE5ArrayIfLi3E1C7mutable7alignedEx5ArrayIjLi2E1C7mutable7alignedE__errcode__" = global i32 0
@"_ZN6cudapy7brownpy7gpu_sim8Universe8_compile12$3clocals$3e10engine$247E5ArrayIfLi2E1C7mutable7alignedExxf5ArrayIjLi2E1C7mutable7alignedE5ArrayI6RecordILi854EELi1E1C7mutable7alignedE5ArrayIfLi3E1C7mutable7alignedEx5ArrayIjLi2E1C7mutable7alignedE__tidx__" = global i32 0
@"_ZN6cudapy7brownpy7gpu_sim8Universe8_compile12$3clocals$3e10engine$247E5ArrayIfLi2E1C7mutable7alignedExxf5ArrayIjLi2E1C7mutable7alignedE5ArrayI6RecordILi854EELi1E1C7mutable7alignedE5ArrayIfLi3E1C7mutable7alignedEx5ArrayIjLi2E1C7mutab

In [5]:
u.run(10_000_000);

100%|██████████| 10000000/10000000 [00:09<00:00, 1090646.80it/s, total=10 ms]

With 4096 particles
------------------------------------------
GPU time per step and per particles:
Allocation: 3.89 fs
Engine: 218 ps
Transfert to RAM: 4.47 fs
Total: 218 ps
------------------------------------------
CPU time per step and per particles:
Total: 224 ps
------------------------------------------
For a timestep of 1 ns
To simulate the trajectory of 1 particle during 1 s, we need 224 ms





In [5]:
u.run(10_000_000);

100%|██████████| 10000000/10000000 [00:00<00:00, 10202458.14it/s, total=10 ms]

With 4096 particles
------------------------------------------
GPU time per step and per particles:
Allocation: 3.61 fs
Engine: 18.5 ps
Transfert to RAM: 4.25 fs
Total: 18.5 ps
------------------------------------------
CPU time per step and per particles:
Total: 24 ps
------------------------------------------
For a timestep of 1 ns
To simulate the trajectory of 1 particle during 1 s, we need 24 ms





In [1]:
import numpy as np
from numba import cuda
import numba as nb
from numba.cuda.random import (create_xoroshiro128p_states,
                               xoroshiro128p_normal_float32)
import math, time


In [2]:
N = 4*1024
r0 = np.random.uniform(size=(N))
d_r0 = cuda.to_device(r0.astype(np.float32))
N_steps = 1_000_000
freq_dumps = 0
trajectory = cuda.device_array(shape=(N,0))

rng_states = create_xoroshiro128p_states(N, seed=0)

threadsperblock = 128
blockspergrid = math.ceil(N / threadsperblock)

In [42]:
@cuda.jit
def test_kernel(r0, freq_dumps, N_steps, rng_states, trajectory):
    pos = cuda.grid(1)
    i_dump = 0
    if pos < r0.shape[0]:
        x0 = r0[pos]
        for step in range(N_steps):
            x1 = x0 + xoroshiro128p_normal_float32(rng_states, pos)
            if freq_dumps !=0:
                if (step + 1)%freq_dumps == 0:
                    trajectory[pos, i_dump] = x1
                    i_dump += 1
            x0=x1

dt_cpu = []
for i in range(4):
    t0_cpu = time.perf_counter()
    test_kernel[blockspergrid, threadsperblock](r0, freq_dumps, N_steps, rng_states, trajectory)
    cuda.synchronize()
    t1_cpu = time.perf_counter()
    dt_cpu.append(t1_cpu-t0_cpu)
print(f'{np.mean(dt_cpu[1:])/N/N_steps*1E9:.3e} ns/step/particles')

1.373e-01 ns/step/particles


In [41]:
@cuda.jit
def test_kernel2(r0, freq_dumps, N_steps, rng_states, trajectory):
    pos = cuda.grid(1)
    i_dump = 0
    freq_dumps = 0
    if pos < r0.shape[0]:
        x0 = r0[pos]
        for step in range(N_steps):
            x1 = x0 + xoroshiro128p_normal_float32(rng_states, pos)
            if freq_dumps !=0:
                if (step + 1)%freq_dumps == 0:
                    trajectory[pos, i_dump] = x1
                    i_dump += 1
            x0=x1

dt_cpu = []
for i in range(4):
    t0_cpu = time.perf_counter()
    test_kernel2[blockspergrid, threadsperblock](r0, freq_dumps, N_steps, rng_states, trajectory)
    cuda.synchronize()
    t1_cpu = time.perf_counter()
    dt_cpu.append(t1_cpu-t0_cpu)
print(f'{np.mean(dt_cpu[1:])/N/N_steps*1E9:.3e} ns/step/particles')

9.469e-03 ns/step/particles


In [38]:
freq_dumps = 0
@cuda.jit
def test_kernel3(r0, N_steps, rng_states, trajectory):
    pos = cuda.grid(1)
    i_dump = 0
    if pos < r0.shape[0]:
        x0 = r0[pos]
        for step in range(N_steps):
            x1 = x0 + xoroshiro128p_normal_float32(rng_states, pos)
            if freq_dumps !=0:
                if (step + 1)%freq_dumps == 0:
                    trajectory[pos, i_dump] = x1
                    i_dump += 1
            x0=x1

dt_cpu = []
for i in range(4):
    t0_cpu = time.perf_counter()
    test_kernel3[blockspergrid, threadsperblock](r0, N_steps, rng_states, trajectory)
    cuda.synchronize()
    t1_cpu = time.perf_counter()
    dt_cpu.append(t1_cpu-t0_cpu)
print(f'{np.mean(dt_cpu[1:])/N/N_steps*1E9:.3e} ns/step/particles')

9.237e-03 ns/step/particles


In [None]:
print(list(test_kernel.inspect_llvm().items())[0][1])

# CUPY

In [43]:
import cupy as cp

In [48]:
freq_dumps = 0
@cuda.jit
def test_kernel4(r0, dr, N_steps, rng_states, trajectory):
    pos = cuda.grid(1)
    i_dump = 0
    if pos < r0.shape[0]:
        x0 = r0[pos]
        for step in range(N_steps):
            x1 = x0 + dr[pos, step]
            if freq_dumps !=0:
                if (step + 1)%freq_dumps == 0:
                    trajectory[pos, i_dump] = x1
                    i_dump += 1
            x0=x1

dt_cpu = []
for i in range(4):
    t0_cpu = time.perf_counter()
    Nchunck = 100
    sub_N_steps = N_steps//Nchunck
    for i in range(Nchunck):
        dr = cp.random.normal(size=(N, sub_N_steps), dtype=np.float32)
        test_kernel4[blockspergrid, threadsperblock](r0, dr, sub_N_steps, rng_states, trajectory)
    cuda.synchronize()
    t1_cpu = time.perf_counter()
    dt_cpu.append(t1_cpu-t0_cpu)
print(f'{np.mean(dt_cpu[1:])/N/N_steps*1E9:.3e} ns/step/particles')

4.651e-02 ns/step/particles


In [54]:
loaded_from_source = r'''
extern "C"{
__host__ void test() {
    int m_numSims =  1024*2*1000;
    cudaResult = cudaMalloc((void **)&d_points, 2 * m_numSims * sizeof(float));
    curandStatus_t curandResult;
    curandGenerator_t prng;
    curandResult = curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
}

}'''

In [3]:
import ctypes as ct
import numpy as np
N=1024
N_steps = 10_000_000
# E = ctypes.cdll.LoadLibrary("C:\\Users\\monet\\Documents\\Code\\Python-cuda\\2-curand\\randGen.dll")

def get_cuda_run():
      dll = ct.windll.LoadLibrary("C:\\Users\\monet\\Documents\\Code\\Python-cuda\\2-curand\\randGen.dll")
      func = dll.run
      func.argtypes = [ct.c_int, ct.c_int, ct.POINTER(ct.c_float)] 
      return func
__run = get_cuda_run()
def cuda_run(N_steps:int, N_particles:int, d_points):
      p_d_points = d_points.ctypes.data_as(ct.POINTER(ct.c_float))
      __run(N_steps, N_particles, p_d_points)
d_points = np.ones((N), dtype=np.float32)
cuda_run(N_steps, N, d_points)
# dll.run(N_steps,
#       N,
#       ct.c_void_p(d_points.ctypes.data))

In [2]:
d_points

array([-0.27442148,  0.05883163,  2.2392263 , ...,  0.88487107,
       -0.06198183, -1.155455  ], dtype=float32)

In [2]:
d_points

array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)