In [1]:
import time
import numpy as np
import numba as nb
import randomgen
N_THREADS = nb.get_num_threads()
from numpy.random import PCG64
from timeit import timeit



In [20]:
# Draw one standard-normal sample, requesting float32, via `randomgen.generator`
# (the recorded output is a single scalar).
randomgen.generator.standard_normal(dtype='float32')

-0.6088277101516724

In [51]:
@nb.njit(parallel=True)
def test(Nsteps, out):
    """Add ``2 * Nsteps`` standard-normal draws to every entry of ``out``.

    The entries of ``out`` are processed in parallel with ``prange``. Each
    iteration first reseeds the RNG with its own index, then performs
    ``Nsteps`` inner steps, each adding two ``np.random.standard_normal()``
    samples to the entry in place.

    Parameters
    ----------
    Nsteps : int
        Number of inner steps per entry (two draws per step).
    out : 1-D array
        Accumulator, modified in place. Nothing is returned.
    """
    n_slots = out.shape[0]
    for slot in nb.prange(n_slots):
        np.random.seed(slot)
        for _ in range(Nsteps):
            out[slot] += np.random.standard_normal()
            out[slot] += np.random.standard_normal()

# Stand-alone NumPy PCG64 bit generator, plus two attributes of its CFFI
# interface: the `next_double` entry point and the `state_address`.
# NOTE(review): none of these three names is used by the cells visible in this
# notebook -- presumably groundwork for driving the bit generator from
# Numba-compiled code; confirm or remove.
bit_gen = PCG64()
next_d = bit_gen.cffi.next_double
state_addr = bit_gen.cffi.state_address

@nb.jit(nopython=True, parallel=True)
def test_f32(Nsteps, out):
    """Add ``Nsteps`` Box-Muller normal pairs to every entry of ``out``.

    The entries of ``out`` are processed in parallel with ``prange``. Each
    iteration reseeds the RNG with its own index; then, ``Nsteps`` times, two
    uniform draws are converted into two standard-normal values (Box-Muller
    transform) and their sum is added to the entry in place.

    Parameters
    ----------
    Nsteps : int
        Number of Box-Muller pairs generated per entry.
    out : 1-D array
        Accumulator, modified in place. Nothing is returned.
    """
    for i in nb.prange(out.shape[0]):
        np.random.seed(i)
        for step in range(Nsteps):
            # rand() samples the half-open interval [0, 1); mirroring it to
            # (0, 1] guarantees log(U1) is finite, so R can never become inf
            # and poison out[i] with inf/NaN.
            U1 = 1.0 - np.random.rand()
            U2 = np.random.rand()
            R = np.sqrt(-2 * np.log(U1))
            Theta = 2 * np.pi * U2
            X = R * np.cos(Theta)
            Y = R * np.sin(Theta)
            out[i] += X + Y

In [46]:
# Benchmark size: number of inner steps handed to the kernels, and number of
# particles (length of the accumulator array the kernels iterate over).
N_steps = 10 ** 7
N_particles = 10


In [47]:
# Benchmark the `test` kernel on a float64 accumulator: one untimed warm-up
# call, then the mean duration of N_loops timed calls.
out = np.zeros(N_particles)
test(N_steps, out) # warmup
N_loops = 5
# perf_counter_ns() is monotonic and uses the highest-resolution clock
# available; time_ns() follows the system clock, which can be adjusted
# (NTP, manual change) while the benchmark is running.
t0 = time.perf_counter_ns()
for _ in range(N_loops):
    test(N_steps, out)
delta_t = (time.perf_counter_ns() - t0)/N_loops
# NOTE(review): multiplying by N_THREADS assumes the prange work is spread
# evenly over all threads; with N_particles this small that may not hold.
print(f'{delta_t/N_steps/N_particles*N_THREADS:.2f} ns/dt/p/cores')

175.72 ns/dt/p/cores


In [52]:
# Benchmark the `test_f32` kernel on a float32 accumulator: one untimed
# warm-up call, then the mean duration of N_loops timed calls.
out = np.zeros(N_particles, dtype='float32')
test_f32(N_steps, out) # warmup
N_loops = 5
# perf_counter_ns() is monotonic and uses the highest-resolution clock
# available; time_ns() follows the system clock, which can be adjusted
# (NTP, manual change) while the benchmark is running.
t0 = time.perf_counter_ns()
for _ in range(N_loops):
    test_f32(N_steps, out)
delta_t = (time.perf_counter_ns() - t0)/N_loops
# NOTE(review): multiplying by N_THREADS assumes the prange work is spread
# evenly over all threads; with N_particles this small that may not hold.
print(f'{delta_t/N_steps/N_particles*N_THREADS:.2f} ns/dt/p/cores')

61.34 ns/dt/p/cores


In [None]:
# Draw one standard-normal sample via `randomgen.generator` with default
# arguments. NOTE(review): this cell has no execution count or output.
randomgen.generator.standard_normal()

In [52]:
# Print Numba's parallel-transform report for `test` at verbosity level 1.
# NOTE(review): the recorded output lists a body that calls
# np.random.uniform(0,1), i.e. an older definition of `test` than the one in
# this notebook -- re-run after the current definition to refresh it.
test.parallel_diagnostics(level=1)

 
 Parallel Accelerator Optimizing:  Function test, <ipython-
input-42-b98cb058667e> (1)  


Parallel loop listing for  Function test, <ipython-input-42-b98cb058667e> (1) 
------------------------------------------------|loop #ID
@nb.jit(nopython=True, parallel=True)           | 
def test(Nsteps, out):                          | 
    for i in nb.prange(out.shape[0]):-----------| #4
        np.random.seed(i)                       | 
        for step in range(Nsteps):              | 
            out[i] += np.random.uniform(0,1)    | 
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 


In [57]:
import randomgen

ModuleNotFoundError: No module named 'randomgen'

In [54]:
from numba import cuda

In [55]:
# List the CUDA devices Numba finds and whether each one is supported.
cuda.detect()

Found 1 CUDA devices
id 0    b'NVIDIA GeForce GTX 1050 Ti with Max-Q Design'                              [SUPPORTED]
                      compute capability: 6.1
                           pci device id: 0
                              pci bus id: 2
Summary:
	1/1 devices are supported


True

In [56]:
# Boolean check of CUDA availability as reported by Numba.
cuda.is_available()

True

In [1]:
from brownpy.settings import set_computation_type
set_computation_type('gpu')
from brownpy import gpu_sim
from brownpy.gpu_sim import Universe
import brownpy.topology as Top
from brownpy.utils import prefix, unwrap
import matplotlib.pyplot as plt
# https://matplotlib.org/stable/gallery/axes_grid1/inset_locator_demo.html
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import h5py
import numpy as np
from threading import Thread
from pathlib import Path
import time
import pandas as pd
from tqdm.notebook import tqdm
plt.style.use('default')
import cupy as cp

In [2]:
# Diffusion coefficient, in Å²/fs (1.5E-4 Å²/fs = 1.5E-9 m²/s)
D = 1.5E-4 # Å²/fs

"""
┃         ┃   ┃         ┃      ↑  
┃         ┃   ┃         ┃      │
┃         ┗━━━┛         ┃      │
┃                       ┃ ↕ Hc │ 2 H
┃         ┏━━━┓         ┃      │
┃         ┃   ┃         ┃      │
┃         ┃   ┃         ┃      ↓
 ←-------→ ←-→ ←-------→
     L      Lc     L
"""
# Geometry, expressed in the notation of the sketch above (lengths in Å)
Hc = 1E2 # Å (10 nm) - channel opening, marked "Hc" in the sketch
L = 250*Hc # Length of each side region, marked "L" in the sketch
H_factor = 1 # H expressed in units of Hc
H = H_factor*Hc # The sketch marks the full height as "2 H"
ar_factor = 100 # Lc/Hc ratio
Lc = ar_factor*Hc # Channel length, marked "Lc" in the sketch

# Timestep in fs: 0.05 * Hc**2 / D, truncated to an integer
dt = int(0.05*Hc**2/D)
# Number of particles (passed as N when the Universe is built)
N = 2*1024
print(f'dt = {dt:.2e} fs = {prefix(dt*1E-15)}s')

# Total number of steps planned: 1.5E6 / 0.05 = 3E7
Nsteps = int(1.5*1E6/0.05)
print(f"Number of steps : {Nsteps:.2e} = {prefix(dt*Nsteps*1E-15)}s")


# Build the channel topology from the geometry constants, then the simulation
# Universe that runs N particles on it.
top = Top.ElasticChannel2(Hc=Hc, Lc=Lc,
                          H=H, L=L)
# The output path encodes H_factor and ar_factor.
# NOTE(review): overwrite=True presumably replaces any existing output at
# that path -- confirm before re-running on results you want to keep.
u=Universe(N=N, top=top, D=D, dt=dt,
           output_path=f'./4/channel/{H_factor}_{ar_factor}',
           overwrite=True)

dt = 3.33e+06 fs = 3.33 ns
Number of steps : 3.00e+07 = 100 ms


In [4]:
# GPU backend: run 1/100 of the planned steps (Nsteps//100), without passing
# freq_dumps.
u.run(Nsteps//100)

100%|██████████| 300000/300000 [00:03<00:00, 99334.95it/s, total=2 ms] 

With 2048 particles
------------------------------------------
GPU time per step and per particles:
Allocation: 49.6 fs
Engine: 4.8 ns
Transfert to RAM: 3.21 ps
Total: 4.8 ns
------------------------------------------
CPU time per step and per particles:
Total: 4.92 ns
------------------------------------------
For a timestep of 3.33 ns
To simulate the trajectory of 1 particle during 1 s, we need 1.48  s





In [5]:
# GPU backend: same step count, this time with freq_dumps=100.
# NOTE(review): freq_dumps presumably sets how often the state is dumped to
# the output -- confirm against brownpy's Universe.run.
u.run(Nsteps//100, freq_dumps=100)

100%|██████████| 300000/300000 [00:02<00:00, 100366.95it/s, total=3 ms]

With 2048 particles
------------------------------------------
GPU time per step and per particles:
Allocation: 39.9 fs
Engine: 4.66 ns
Transfert to RAM: 34.5 ps
Total: 4.69 ns
------------------------------------------
CPU time per step and per particles:
Total: 4.87 ns
------------------------------------------
For a timestep of 3.33 ns
To simulate the trajectory of 1 particle during 1 s, we need 1.46  s





In [4]:
# CPU backend: run 1/100 of the planned steps (Nsteps//100), without passing
# freq_dumps.
u.run(Nsteps//100)

100%|██████████| 300000/300000 [00:14<00:00, 20960.32it/s, total=2 ms]

With 2048 particles
------------------------------------------
CPU time per step and per particles:
Total: 23.3 ns
------------------------------------------
For a timestep of 3.33 ns
To simulate the trajectory of 1 particle during 1 s, we need 6.99  s





In [6]:
# CPU backend: shorter run of 10000 steps with freq_dumps=100.
# NOTE(review): freq_dumps presumably sets how often the state is dumped to
# the output -- confirm against brownpy's Universe.run.
u.run(10000, freq_dumps=100)

100%|██████████| 10000/10000 [00:00<00:00, 15250.23it/s, total=2.04 ms]

With 2048 particles
------------------------------------------
CPU time per step and per particles:
Total: 32.2 ns
------------------------------------------
For a timestep of 3.33 ns
To simulate the trajectory of 1 particle during 1 s, we need 9.65  s





In [5]:
# Load the results stored under index 0 of the Universe; per the recorded
# output, indexing reads the trajectory plus the 'left' and 'right' data.
data=u[0]

Reading trajectory ...
... Done
left  read
right  read


In [7]:
# Shape of the loaded trajectory array.
# NOTE(review): axes are presumably (particle, coordinate, dump) -- confirm.
data['trajectory'].shape

(2048, 2, 10)

In [4]:
# Open the simulation output read-only for manual inspection.
# NOTE(review): the handle is kept open for the following cell and is never
# closed in the visible cells -- call f.close() once done.
f = h5py.File(u.output_path, 'r')

In [8]:
# Inspect the 'left' region dataset of run 0 (the recorded output shows a
# 1-D "<u2" dataset with 300000 entries).
f['run/0/regions/left']

<HDF5 dataset "left": shape (300000,), type "<u2">

In [1]:
# Attempt to turn off GPU computation by setting a private flag on gpu_sim.
# NOTE(review): the recorded run raised NameError (gpu_sim was not defined in
# that session); the import cell selects the backend with
# set_computation_type(...) instead.
gpu_sim._GPU_COMPUTATION = False

NameError: name 'gpu_sim' is not defined

In [7]:
# Probe for the flag on the Universe class; the recorded output shows that
# Universe has no attribute `_GPU_COMPUTATION`.
Universe._GPU_COMPUTATION

AttributeError: type object 'Universe' has no attribute '_GPU_COMPUTATION'

In [5]:
# Probe for a notebook-level `_GPU_COMPUTATION` name; the recorded output is a
# NameError, i.e. no such name exists in the notebook namespace.
_GPU_COMPUTATION

NameError: name '_GPU_COMPUTATION' is not defined