diff --git a/fbpic/boundaries/moving_window.py b/fbpic/boundaries/moving_window.py index 33d2d921e..f08775b79 100644 --- a/fbpic/boundaries/moving_window.py +++ b/fbpic/boundaries/moving_window.py @@ -9,6 +9,7 @@ from scipy.constants import c from fbpic.particles import Particles from fbpic.lpa_utils.boosted_frame import BoostConverter +from fbpic.threading_utils import njit_parallel, prange # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: @@ -318,86 +319,36 @@ def shift_spect_grid( self, grid, n_move, """ if grid.use_cuda: shift = grid.d_field_shift + # Get a 2D CUDA grid of the size of the grid + tpb, bpg = cuda_tpb_bpg_2d( grid.Ep.shape[0], grid.Ep.shape[1] ) # Shift all the fields on the GPU - self.shift_spect_field_gpu( grid.Ep, shift, n_move ) - self.shift_spect_field_gpu( grid.Em, shift, n_move ) - self.shift_spect_field_gpu( grid.Ez, shift, n_move ) - self.shift_spect_field_gpu( grid.Bp, shift, n_move ) - self.shift_spect_field_gpu( grid.Bm, shift, n_move ) - self.shift_spect_field_gpu( grid.Bz, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Ep, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Em, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Ez, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bp, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bm, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bz, shift, n_move ) if shift_rho: - self.shift_spect_field_gpu( grid.rho_prev, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.rho_prev, shift, n_move ) if shift_currents: - self.shift_spect_field_gpu( grid.Jp, shift, n_move ) - self.shift_spect_field_gpu( grid.Jm, shift, n_move ) - self.shift_spect_field_gpu( grid.Jz, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Jp, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Jm, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Jz, shift, n_move ) else: shift = grid.field_shift # Shift all the fields on the CPU - self.shift_spect_field( grid.Ep, shift, n_move ) - self.shift_spect_field( grid.Em, shift, n_move ) - self.shift_spect_field( grid.Ez, shift, n_move ) - self.shift_spect_field( grid.Bp, shift, n_move ) - self.shift_spect_field( grid.Bm, shift, n_move ) - self.shift_spect_field( grid.Bz, shift, n_move ) + shift_spect_array_cpu( grid.Ep, shift, n_move ) + shift_spect_array_cpu( grid.Em, shift, n_move ) + shift_spect_array_cpu( grid.Ez, shift, n_move ) + shift_spect_array_cpu( grid.Bp, shift, n_move ) + shift_spect_array_cpu( grid.Bm, shift, n_move ) + shift_spect_array_cpu( grid.Bz, shift, n_move ) if shift_rho: - self.shift_spect_field( grid.rho_prev, shift, n_move ) + shift_spect_array_cpu( grid.rho_prev, shift, n_move ) if shift_currents: - self.shift_spect_field( grid.Jp, shift, n_move ) - self.shift_spect_field( grid.Jm, shift, n_move ) - self.shift_spect_field( grid.Jz, shift, n_move ) - - def shift_spect_field( self, field_array, shift_factor, n_move ): - """ - Shift the field 'field_array' by n_move cells. - This is done in spectral space and corresponds to multiplying the - fields with the factor exp(i*kz_true*dz)**n_move . 
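
Reviewer note: the shift applied above rests on the Fourier shift theorem. A minimal standalone NumPy sketch (not part of the patch, assuming a uniform periodic grid and made-up values) of the identity that a shift by n_move cells is a multiplication by exp(1.j*kz*dz)**n_move in spectral space:

import numpy as np

Nz, dz, n_move = 64, 0.1, 3
z = dz*np.arange( Nz )
field = np.exp( -((z - 3.2)/(5*dz))**2 )      # sample field profile
kz = 2*np.pi*np.fft.fftfreq( Nz, d=dz )       # true wavenumbers of the grid
shift_factor = np.exp( 1.j*kz*dz )            # one-cell shift factor
shifted = np.fft.ifft( np.fft.fft( field ) * shift_factor**n_move )
# Positive n_move shifts the field backwards, as the docstrings state
assert np.allclose( shifted.real, np.roll( field, -n_move ) )
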
- (Typically n_move is positive, and the fields are shifted backwards) - - Parameters - ---------- - field_array: 2darray of complexs - Contains the value of the fields, and is modified by - this function - - shift_factor: 1darray of complexs - Contains the shift array, that is multiplied to the fields in - spectral space to shift them by one cell in spatial space - ( exp(i*kz_true*dz) ) - - n_move: int - The number of cells by which the grid should be shifted - """ - # Multiply with (shift_factor*sign(n_move))**n_move - field_array *= ( shift_factor[:, np.newaxis] )**n_move - - def shift_spect_field_gpu( self, field_array, shift_factor, n_move): - """ - Shift the field 'field_array' by n_move cells on the GPU. - This is done in spectral space and corresponds to multiplying the - fields with the factor exp(i*kz_true*dz)**n_move . - (Typically n_move is positive, and the fields are shifted backwards) - - Parameters - ---------- - field_array: 2darray of complexs - Contains the value of the fields, and is modified by - this function - - shift_factor: 1darray of complexs - Contains the shift array, that is multiplied to the fields in - spectral space to shift them by one cell in spatial space - ( exp(i*kz_true*dz) ) - - n_move: int - The number of cells by which the grid should be shifted - """ - # Get a 2D CUDA grid of the size of the grid - dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d( - field_array.shape[0], field_array.shape[1] ) - # Shift the field array in place - shift_spect_array_gpu[dim_grid_2d, dim_block_2d]( - field_array, shift_factor, n_move) + shift_spect_array_cpu( grid.Jp, shift, n_move ) + shift_spect_array_cpu( grid.Jm, shift, n_move ) + shift_spect_array_cpu( grid.Jz, shift, n_move ) def shift_interp_grid( self, grid, n_move, shift_rho=True, shift_currents=False ): @@ -513,6 +464,39 @@ def shift_interp_field_gpu( self, field_array, n_move): # Return the new shifted field array return( field_array ) +@njit_parallel +def shift_spect_array_cpu( field_array, shift_factor, n_move ): + """ + Shift the field 'field_array' by n_move cells on CPU. + This is done in spectral space and corresponds to multiplying the + fields with the factor exp(i*kz_true*dz)**n_move . 
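
Reviewer note: the explicit loop in shift_spect_array_cpu (body just below) reproduces the removed NumPy broadcast field_array *= ( shift_factor[:, np.newaxis] )**n_move in a prange-friendly form. A standalone equivalence check with made-up data, not part of the patch:

import numpy as np

Nz, Nr, n_move = 8, 4, 3
rng = np.random.default_rng( 0 )
field = rng.standard_normal( (Nz, Nr) ) + 1.j*rng.standard_normal( (Nz, Nr) )
shift = np.exp( 2.j*np.pi*rng.random( Nz ) )
expected = field * shift[:, np.newaxis]**n_move
result = field.copy()
for iz in range( Nz ):
    power_shift = shift[iz]
    for i in range( 1, n_move ):   # raise to the power n_move by repeated multiplication
        power_shift *= shift[iz]
    for ir in range( Nr ):
        result[iz, ir] *= power_shift
assert np.allclose( result, expected )
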
+ + Parameters + ---------- + field_array: 2darray of complexs + Contains the value of the fields, and is modified by + this function + + shift_factor: 1darray of complexs + Contains the shift array, that is multiplied to the fields in + spectral space to shift them by one cell in spatial space + ( exp(i*kz_true*dz) ) + + n_move: int + The number of cells by which the grid should be shifted + """ + Nz, Nr = field_array.shape + + # Loop over the 2D array (in parallel over z if threading is enabled) + for iz in prange( Nz ): + power_shift = shift_factor[iz] + # Calculate the shift factor (raising to the power n_move) + for i in range(1,n_move): + power_shift *= shift_factor[iz] + # Shift fields backwards + for ir in range( Nr ): + field_array[iz, ir] *= power_shift + if cuda_installed: @cuda.jit('void(complex128[:,:], complex128[:,:], int32)') diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py index 830fec37a..e80652181 100644 --- a/fbpic/fields/numba_methods.py +++ b/fbpic/fields/numba_methods.py @@ -5,21 +5,18 @@ This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) It defines the optimized fields methods that use numba on a CPU """ -import numba from scipy.constants import c, epsilon_0, mu_0 c2 = c**2 +from fbpic.threading_utils import njit_parallel, prange -@numba.jit('void(complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - float64[:,:], float64[:,:], float64[:,:], \ - float64, int32, int32)') +@njit_parallel def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, inv_dt, Nz, Nr ): """ Correct the currents in spectral space, using the standard PSATD scheme """ - # Loop over the 2D grid - for iz in range(Nz): + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): for ir in range(Nr): # Calculate the intermediate variable F @@ -33,13 +30,9 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, Jm[iz, ir] += -0.5 * kr[iz, ir] * F Jz[iz, ir] += -1.j * kz[iz, ir] * F -@numba.jit('void(complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - float64[:,:], float64[:,:], float64[:,:], \ - float64[:,:], float64[:,:], float64[:,:], float64[:,:], float64, \ - int8, int32, int32)') + return + +@njit_parallel def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, @@ -50,8 +43,8 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, See the documentation of SpectralGrid.push_eb_with """ - # Loop over the 2D grid - for iz in range(Nz): + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): for ir in range(Nr): # Save the electric fields, since they are needed for the B push @@ -106,7 +99,9 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \ + 1.j*kr[iz, ir]*Jm[iz, ir] ) -@numba.jit + return + +@njit_parallel def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, j_corr_coef, T_eb, T_cc, @@ -115,8 +110,8 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, Correct the currents in spectral space, using the assumption of comoving currents """ - # Loop over the 2D grid - for iz in range(Nz): + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz):
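
Reviewer note: the correction loop in numba_correct_currents_standard above enforces the discrete continuity equation in spectral space. A sketch of the algebra, assuming FBPIC's spectral form of the divergence, i k_z J_z + k_r (J_+ - J_-), and reconstructing the intermediate variable F whose assignment is outside this hunk:

F = -\frac{1}{k^2}\left[ \frac{\rho^{n+1}-\rho^{n}}{\Delta t}
      + i k_z J_z + k_r \left( J_+ - J_- \right) \right],
\qquad k^2 = k_z^2 + k_r^2

J_\pm \mathrel{+}= \pm\tfrac{1}{2}\, k_r F, \quad
J_z \mathrel{+}= -i k_z F
\;\Longrightarrow\;
i k_z J_z + k_r \left( J_+ - J_- \right)
  + \frac{\rho^{n+1}-\rho^{n}}{\Delta t} = 0

since the updates add exactly (k_r^2 + k_z^2) F = k^2 F to the divergence. The comoving variant performs the analogous cancellation with the T_eb, T_cc coefficients.
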
for ir in range(Nr): # Calculate the intermediate variable F @@ -130,7 +125,9 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, Jm[iz, ir] += -0.5 * kr[iz, ir] * F Jz[iz, ir] += -1.j * kz[iz, ir] * F -@numba.jit + return + +@njit_parallel def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, @@ -207,3 +204,5 @@ def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, + 1.j*kr[iz, ir]*Em_old ) \ + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \ + 1.j*kr[iz, ir]*Jm[iz, ir] ) + + return diff --git a/fbpic/fields/spectral_transform/spectral_transformer.py b/fbpic/fields/spectral_transform/spectral_transformer.py index a9f99222f..af2d2858b 100644 --- a/fbpic/fields/spectral_transform/spectral_transformer.py +++ b/fbpic/fields/spectral_transform/spectral_transformer.py @@ -9,6 +9,7 @@ from .hankel import DHT from .fourier import FFT +from .threading_methods import numba_rt_to_pm, numba_pm_to_rt # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: @@ -139,12 +140,13 @@ def spect2interp_vect( self, spect_array_p, spect_array_m, self.spect_buffer_r, self.spect_buffer_t ) else : # Combine them on the CPU - # (It is important to write the affectation in the following way, - # since self.spect_buffer_p and self.spect_buffer_r actually point - # to the same object, for memory economy) - self.spect_buffer_r[:,:], self.spect_buffer_t[:,:] = \ - ( self.spect_buffer_p + self.spect_buffer_m), \ - 1.j*( self.spect_buffer_p - self.spect_buffer_m) + # (self.spect_buffer_r and self.spect_buffer_t are + # passed in the following line, in order to make things + # explicit, but they actually point to the same object + # as self.spect_buffer_p, self.spect_buffer_m, + # for economy of memory) + numba_pm_to_rt( self.spect_buffer_p, self.spect_buffer_m, + self.spect_buffer_r, self.spect_buffer_t ) # Finally perform the FFT (along axis 0, which corresponds to z) self.fft.inverse_transform( self.spect_buffer_r, interp_array_r ) @@ -205,13 +207,14 @@ def interp2spect_vect( self, interp_array_r, interp_array_t, self.spect_buffer_r, self.spect_buffer_t, self.spect_buffer_p, self.spect_buffer_m ) else : - # Combine them on the CPU - # (It is important to write the affectation in the following way, - # since self.spect_buffer_p and self.spect_buffer_r actually point - # to the same object, for memory economy.) 
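
Reviewer note: the removed tuple assignments were written that way because the p/m and r/t buffers alias each other; the new numba_rt_to_pm / numba_pm_to_rt kernels (below) preserve correctness by reading each element into temporaries before writing. A toy demonstration with made-up values, not part of the patch:

import numpy as np

buffer_r = np.array( [1.+2.j] )
buffer_t = np.array( [3.-1.j] )
buffer_p, buffer_m = buffer_r, buffer_t   # aliased buffers, as in the transformer
value_r = buffer_r[0]                     # read before overwriting
value_t = buffer_t[0]
buffer_p[0] = 0.5*( value_r - 1.j*value_t )
buffer_m[0] = 0.5*( value_r + 1.j*value_t )
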
- self.spect_buffer_p[:,:], self.spect_buffer_m[:,:] = \ - 0.5*( self.spect_buffer_r - 1.j*self.spect_buffer_t ), \ - 0.5*( self.spect_buffer_r + 1.j*self.spect_buffer_t ) + # Combine them on the CPU + # (self.spect_buffer_p and self.spect_buffer_m are + # passed in the following line, in order to make things + # explicit, but they actually point to the same object + # as self.spect_buffer_r, self.spect_buffer_t, + # for economy of memory) + numba_rt_to_pm( self.spect_buffer_r, self.spect_buffer_t, + self.spect_buffer_p, self.spect_buffer_m ) # Perform the inverse DHT (along axis -1, which corresponds to r) self.dhtp.transform( self.spect_buffer_p, spect_array_p ) diff --git a/fbpic/fields/spectral_transform/threading_methods.py b/fbpic/fields/spectral_transform/threading_methods.py new file mode 100644 index 000000000..ce16dbeef --- /dev/null +++ b/fbpic/fields/spectral_transform/threading_methods.py @@ -0,0 +1,58 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines a set of functions that are useful when converting the +fields from the interpolation grid to the spectral grid and vice-versa +""" +from fbpic.threading_utils import prange, njit_parallel + +# ---------------------------------------------------- +# Functions that combine components in spectral space +# ---------------------------------------------------- + +@njit_parallel +def numba_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) : + """ + Combine the arrays buffer_r and buffer_t to produce the + arrays buffer_p and buffer_m, according to the rules of + the Fourier-Hankel decomposition (see associated paper) + """ + Nz, Nr = buffer_r.shape + + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): + for ir in range(Nr): + + # Use intermediate variables, as the arrays + # buffer_r and buffer_t may actually point to the same + # object as buffer_p and buffer_m, for economy of memory + value_r = buffer_r[iz, ir] + value_t = buffer_t[iz, ir] + # Combine the values + buffer_p[iz, ir] = 0.5*( value_r - 1.j*value_t ) + buffer_m[iz, ir] = 0.5*( value_r + 1.j*value_t ) + + +@njit_parallel +def numba_pm_to_rt( buffer_p, buffer_m, buffer_r, buffer_t ) : + """ + Combine the arrays buffer_p and buffer_m to produce the + arrays buffer_r and buffer_t, according to the rules of + the Fourier-Hankel decomposition (see associated paper) + """ + Nz, Nr = buffer_p.shape + + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): + for ir in range(Nr): + + # Use intermediate variables, as the arrays + # buffer_r and buffer_t may actually point to the same + # object as buffer_p and buffer_m, for economy of memory + value_p = buffer_p[iz, ir] + value_m = buffer_m[iz, ir] + # Combine the values + buffer_r[iz, ir] = ( value_p + value_m ) + buffer_t[iz, ir] = 1.j*( value_p - value_m ) diff --git a/fbpic/lpa_utils/laser/antenna.py b/fbpic/lpa_utils/laser/antenna.py index 3f3e8425c..d1aaf10b0 100644 --- a/fbpic/lpa_utils/laser/antenna.py +++ b/fbpic/lpa_utils/laser/antenna.py @@ -10,8 +10,8 @@ from scipy.constants import e, c, epsilon_0, physical_constants r_e = physical_constants['classical electron radius'][0] from .profiles import gaussian_profile -from fbpic.particles.utility_methods import weights -from fbpic.particles.numba_methods import deposit_field_numba +from fbpic.particles.utilities.utility_methods import weights +from
fbpic.particles.deposition.numba_methods import deposit_field_numba # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed diff --git a/fbpic/main.py b/fbpic/main.py index f508b0f69..6c3e6c56f 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -10,6 +10,9 @@ # (This needs to be done before the other imports, # as it sets the cuda context) from mpi4py import MPI +import numba +# Check if threading is available +from .threading_utils import threading_enabled # Check if CUDA is available, then import CUDA functions from .cuda_utils import cuda_installed if cuda_installed: @@ -41,12 +44,10 @@ class Simulation(object): def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax, p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0., n_order=-1, dens_func=None, filter_currents=True, - v_comoving=None, use_galilean=False, - initialize_ions=False, use_cuda=False, - n_guard=None, n_damp=30, - exchange_period=None, boundaries='periodic', - gamma_boost=None, use_all_mpi_ranks=True, - particle_shape='linear' ): + v_comoving=None, use_galilean=False, initialize_ions=False, + use_cuda=False, n_guard=None, n_damp=30, exchange_period=None, + boundaries='periodic', gamma_boost=None, + use_all_mpi_ranks=True, particle_shape='linear' ): """ Initializes a simulation, by creating the following structures: @@ -181,17 +182,17 @@ def dens_func( z, r ) ... particle_shape: str, optional Set the particle shape for the charge/current deposition. - Possible values are 'cubic', 'linear' and 'linear_non_atomic'. - While 'cubic' corresponds to third order shapes and 'linear' - to first order shapes, 'linear_non_atomic' uses an equivalent - deposition scheme to 'linear' which avoids atomics on the GPU. + Possible values are 'cubic', 'linear'. ('cubic' corresponds to + third order shapes and 'linear' to first order shapes). """ - # Check whether to use cuda + # Check whether to use CUDA self.use_cuda = use_cuda if (use_cuda==True) and (cuda_installed==False): print('*** Cuda not available for the simulation.') print('*** Performing the simulation on CPU.') self.use_cuda = False + # CPU multi-threading + self.use_threading = threading_enabled # Register the comoving parameters self.v_comoving = v_comoving @@ -212,7 +213,7 @@ def dens_func( z, r ) ... self.comm = BoundaryCommunicator( Nz, zmin, zmax, Nr, rmax, Nm, dt, boundaries, n_order, n_guard, n_damp, exchange_period, use_all_mpi_ranks ) - print_simulation_setup( self.comm, self.use_cuda ) + print_simulation_setup( self.comm, self.use_cuda, self.use_threading ) # Modify domain region zmin, zmax, p_zmin, p_zmax, Nz = \ self.comm.divide_into_domain(zmin, zmax, p_zmin, p_zmax) @@ -234,19 +235,18 @@ def dens_func( z, r ) ... 
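
Reviewer note: fbpic/threading_utils.py is imported throughout this patch (threading_enabled here, njit_parallel and prange elsewhere) but its contents are not included in the diff. A hedged sketch of what the module presumably provides, namely serial fallbacks when numba's parallel backend is unavailable; the actual module may differ:

# Hypothetical sketch of fbpic/threading_utils.py -- not shown in this diff
try:
    from numba import njit, prange
    njit_parallel = njit( parallel=True )   # multi-threaded jit decorator
    threading_enabled = True
except ImportError:
    threading_enabled = False
    prange = range                          # serial fallback
    def njit_parallel( func ):
        return func                         # no-op decorator
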
# Initialize the electrons and the ions grid_shape = self.fld.interp[0].Ez.shape self.ptcl = [ - Particles( q=-e, m=m_e, n=n_e, Npz=Npz, zmin=p_zmin, - zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, - Nptheta=p_nt, dt=dt, dens_func=dens_func, - use_cuda=self.use_cuda, uz_m=uz_m, - grid_shape=grid_shape, particle_shape=particle_shape) ] + Particles(q=-e, m=m_e, n=n_e, Npz=Npz, zmin=p_zmin, + zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, + Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m, + grid_shape=grid_shape, particle_shape=particle_shape, + use_cuda=self.use_cuda ) ] if initialize_ions : self.ptcl.append( Particles(q=e, m=m_p, n=n_e, Npz=Npz, zmin=p_zmin, zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, - Nptheta=p_nt, dt=dt, dens_func=dens_func, - use_cuda=self.use_cuda, uz_m=uz_m, - grid_shape=grid_shape, - particle_shape=particle_shape ) ) + Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m, + grid_shape=grid_shape, particle_shape=particle_shape, + use_cuda=self.use_cuda ) ) # Register the number of particles per cell along z, and dt # (Necessary for the moving window) @@ -578,7 +578,7 @@ def progression_bar( i, Ntot, measured_start, Nbars=50, char='-'): sys.stdout.write(', %d:%02d:%02d left' % (h, m, s)) sys.stdout.flush() -def print_simulation_setup( comm, use_cuda ): +def print_simulation_setup( comm, use_cuda, use_threading ): """ Print message about the number of proc and whether it is using GPU or CPU. @@ -590,13 +590,20 @@ def print_simulation_setup( comm, use_cuda ): use_cuda: bool Whether the simulation is set up to use CUDA + + use_threading: bool + Whether the simulation is set up to use threads on CPU """ if comm.rank == 0: if use_cuda: message = "\nRunning FBPIC on GPU " else: message = "\nRunning FBPIC on CPU " - message += "with %d proc.\n" %comm.size + message += "with %d proc" %comm.size + if use_threading and not use_cuda: + message += " (%d threads per proc)" %numba.config.NUMBA_NUM_THREADS + message += ".\n" + print( message ) def adapt_to_grid( x, p_xmin, p_xmax, p_nx, ncells_empty=0 ): diff --git a/fbpic/particles/cuda_deposition/cubic.py b/fbpic/particles/cuda_deposition/cubic.py deleted file mode 100644 index 45e15e0eb..000000000 --- a/fbpic/particles/cuda_deposition/cubic.py +++ /dev/null @@ -1,1214 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the deposition methods for rho and J for cubic order shapes -""" -from numba import cuda, int64 -import math -from scipy.constants import c -import numpy as np - -# Shape Factor helper functions to compute particle shapes. - - -@cuda.jit(device=True, inline=True) -def z_shape(cell_position, index): - iz = int64(math.floor(cell_position)) - 1 - if index == 0: - return (-1./6.)*((cell_position-iz)-2)**3 - if index == 1: - return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) - if index == 2: - return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) - if index == 3: - return (-1./6.)*(((iz+3)-cell_position)-2)**3 - - -@cuda.jit(device=True, inline=True) -def r_shape(cell_position, index): - flip_factor = 1. - ir = int64(math.floor(cell_position)) - 1 - if index == 0: - if ir < 0: - flip_factor = -1. - return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 - if index == 1: - if ir+1 < 0: - flip_factor = -1. 
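
Reviewer note on the deleted shape-factor helpers: z_shape (above) evaluates the four cubic (third-order) B-spline weights, which sum to one for any particle position; r_shape adds a sign flip for stencil points that fold below the axis. A standalone check of the partition of unity, not part of the patch:

import math

def z_shape( cell_position, index ):
    iz = math.floor( cell_position ) - 1
    if index == 0:
        return (-1./6.)*((cell_position-iz)-2)**3
    if index == 1:
        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)
    if index == 2:
        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)
    if index == 3:
        return (-1./6.)*(((iz+3)-cell_position)-2)**3

assert abs( sum( z_shape(2.37, k) for k in range(4) ) - 1. ) < 1e-12
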
- return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) - if index == 2: - if ir+2 < 0: - flip_factor = -1. - return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) - if index == 3: - if ir+3 < 0: - flip_factor = -1. - return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 - -# ------------------------------- -# Field deposition utility - rho -# ------------------------------- - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - int32[:], int32[:])') -def deposit_rho_gpu_cubic(x, y, z, w, - invdz, zmin, Nz, - invdr, rmin, Nr, - rho_m0, rho_m1, - cell_idx, prefix_sum): - """ - Deposition of the charge density rho using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of rho that is deposited to the - 16 cells surounding the particle based on its shape (cubic). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 16 variables (one for each surrounding cell) to - maintain parallelism while avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - rho_m0, rho_m1 : 2darrays of complexs - The charge density on the interpolation grid for - mode 0 and 1. (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i / Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare local field arrays - R_m0_00 = 0. - R_m1_00 = 0. + 0.j - - R_m0_01 = 0. - R_m1_01 = 0. + 0.j - - R_m0_02 = 0. - R_m1_02 = 0. + 0.j - - R_m0_03 = 0. - R_m1_03 = 0. + 0.j - - R_m0_10 = 0. - R_m1_10 = 0. + 0.j - - R_m0_11 = 0. - R_m1_11 = 0. + 0.j - - R_m0_12 = 0. - R_m1_12 = 0. + 0.j - - R_m0_13 = 0. - R_m1_13 = 0. + 0.j - - R_m0_20 = 0. - R_m1_20 = 0. + 0.j - - R_m0_21 = 0. - R_m1_21 = 0. + 0.j - - R_m0_22 = 0. - R_m1_22 = 0. + 0.j - - R_m0_23 = 0. - R_m1_23 = 0. + 0.j - - R_m0_30 = 0. - R_m1_30 = 0. + 0.j - - R_m0_31 = 0. - R_m1_31 = 0. + 0.j - - R_m0_32 = 0. - R_m1_32 = 0. + 0.j - - R_m0_33 = 0. - R_m1_33 = 0. 
+ 0.j - - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0_scal = wj * exptheta_m0 - # Mode 1 - R_m1_scal = wj * exptheta_m1 - # Compute values in local copies and consider boundaries - ir0 = int64(math.floor(r_cell)) - 1 - - if (ir0 == -2): - R_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - - if (ir0 == -1): - R_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += 
r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal - R_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - if (ir0 >= 0): - R_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal - R_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - - # Index Shifting since local copies are centered around - 
# the current cell - srl = 0 # shift r lower - sru = 0 # shift r upper inner - sru2 = 0 # shift r upper outer - szl = 0 # shift z lower - szu = 0 # shift z upper inner - szu2 = 0 # shift z upper outer - if (iz_cell-1) < 0: - szl += Nz - if (iz_cell) == (Nz - 1): - szu -= Nz - szu2 -= Nz - if (iz_cell+1) == (Nz - 1): - szu2 -= Nz - if (ir_cell) >= (Nr - 1): - sru = -1 - sru2 = -2 - if (ir_cell+1) == (Nr - 1): - sru2 = -1 - if (ir_cell-1) < 0: - srl = 1 - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m0_00.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell - 1 + srl), R_m0_01.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell - 1 + srl), R_m1_01.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell - 1 + srl), R_m1_01.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m0_02.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m0_03.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell ), R_m0_10.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell ), R_m1_10.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell ), R_m1_10.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_11.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_11.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_11.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell), R_m0_12.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell), R_m1_12.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell), R_m1_12.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell), R_m0_13.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell), R_m1_13.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell), R_m1_13.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m0_20.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 1 + sru), R_m0_21.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 1 + sru), R_m1_21.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 1 + sru), R_m1_21.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m0_22.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m0_23.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m0_30.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 
+ szl, ir_cell + 2 + sru2), R_m1_30.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 2 + sru2), R_m0_31.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 2 + sru2), R_m1_31.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 2 + sru2), R_m1_31.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m0_32.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m0_33.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.imag) - - -# ------------------------------- -# Field deposition utility - J -# ------------------------------- - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:],\ - int32[:], int32[:])') -def deposit_J_gpu_cubic(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - j_r_m0, j_r_m1, - j_t_m0, j_t_m1, - j_z_m0, j_z_m1, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 16 cells surounding the particle based on its shape (cubic). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 16 variables (one for each cell) to maintain - parallelism while avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs - The current component in each direction (r, t, z) - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i/Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). 
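
Reviewer note: both deleted kernels recover the per-cell particle count from the inclusive prefix sum over sorted particles, exactly as the comments above describe. A standalone illustration with made-up counts, not part of the patch:

import numpy as np

particles_per_cell = np.array( [2, 0, 3, 1] )
prefix_sum = np.cumsum( particles_per_cell )   # inclusive sum: [2, 2, 5, 6]
for i in range( len(prefix_sum) ):
    incl_offset = prefix_sum[i]
    if i > 0:
        frequency_per_cell = incl_offset - prefix_sum[i-1]
    else:
        frequency_per_cell = incl_offset
    assert frequency_per_cell == particles_per_cell[i]
    # sorted particle indices handled by cell i:
    # incl_offset-1, incl_offset-2, ..., incl_offset-frequency_per_cell
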
- incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare the local field value for - # all possible deposition directions, - # depending on the shape order and per mode for r,t and z. - J_r_m0_00 = 0. - J_t_m0_00 = 0. - J_z_m0_00 = 0. - J_r_m1_00 = 0. + 0.j - J_t_m1_00 = 0. + 0.j - J_z_m1_00 = 0. + 0.j - - J_r_m0_01 = 0. - J_t_m0_01 = 0. - J_z_m0_01 = 0. - J_r_m1_01 = 0. + 0.j - J_t_m1_01 = 0. + 0.j - J_z_m1_01 = 0. + 0.j - - J_r_m0_02 = 0. - J_t_m0_02 = 0. - J_z_m0_02 = 0. - J_r_m1_02 = 0. + 0.j - J_t_m1_02 = 0. + 0.j - J_z_m1_02 = 0. + 0.j - - J_r_m0_03 = 0. - J_t_m0_03 = 0. - J_z_m0_03 = 0. - J_r_m1_03 = 0. + 0.j - J_t_m1_03 = 0. + 0.j - J_z_m1_03 = 0. + 0.j - - J_r_m0_10 = 0. - J_t_m0_10 = 0. - J_z_m0_10 = 0. - J_r_m1_10 = 0. + 0.j - J_t_m1_10 = 0. + 0.j - J_z_m1_10 = 0. + 0.j - - J_r_m0_11 = 0. - J_t_m0_11 = 0. - J_z_m0_11 = 0. - J_r_m1_11 = 0. + 0.j - J_t_m1_11 = 0. + 0.j - J_z_m1_11 = 0. + 0.j - - J_r_m0_12 = 0. - J_t_m0_12 = 0. - J_z_m0_12 = 0. - J_r_m1_12 = 0. + 0.j - J_t_m1_12 = 0. + 0.j - J_z_m1_12 = 0. + 0.j - - J_r_m0_13 = 0. - J_t_m0_13 = 0. - J_z_m0_13 = 0. - J_r_m1_13 = 0. + 0.j - J_t_m1_13 = 0. + 0.j - J_z_m1_13 = 0. + 0.j - - J_r_m0_20 = 0. - J_t_m0_20 = 0. - J_z_m0_20 = 0. - J_r_m1_20 = 0. + 0.j - J_t_m1_20 = 0. + 0.j - J_z_m1_20 = 0. + 0.j - - J_r_m0_21 = 0. - J_t_m0_21 = 0. - J_z_m0_21 = 0. - J_r_m1_21 = 0. + 0.j - J_t_m1_21 = 0. + 0.j - J_z_m1_21 = 0. + 0.j - - J_r_m0_22 = 0. - J_t_m0_22 = 0. - J_z_m0_22 = 0. - J_r_m1_22 = 0. + 0.j - J_t_m1_22 = 0. + 0.j - J_z_m1_22 = 0. + 0.j - - J_r_m0_23 = 0. - J_t_m0_23 = 0. - J_z_m0_23 = 0. - J_r_m1_23 = 0. + 0.j - J_t_m1_23 = 0. + 0.j - J_z_m1_23 = 0. + 0.j - - J_r_m0_30 = 0. - J_t_m0_30 = 0. - J_z_m0_30 = 0. - J_r_m1_30 = 0. + 0.j - J_t_m1_30 = 0. + 0.j - J_z_m1_30 = 0. + 0.j - - J_r_m0_31 = 0. - J_t_m0_31 = 0. - J_z_m0_31 = 0. - J_r_m1_31 = 0. + 0.j - J_t_m1_31 = 0. + 0.j - J_z_m1_31 = 0. + 0.j - - J_r_m0_32 = 0. - J_t_m0_32 = 0. - J_z_m0_32 = 0. - J_r_m1_32 = 0. + 0.j - J_t_m1_32 = 0. + 0.j - J_z_m1_32 = 0. + 0.j - - J_r_m0_33 = 0. - J_t_m0_33 = 0. - J_z_m0_33 = 0. - J_r_m1_33 = 0. + 0.j - J_t_m1_33 = 0. + 0.j - J_z_m1_33 = 0. + 0.j - - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. 
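
Reviewer note: the cylindrical conversion above computes the azimuthal phase factor without a trigonometric call; mode m deposits with weight exp(i*m*theta), so mode 0 uses 1 and mode 1 uses cos(theta) + 1j*sin(theta). A standalone check with made-up coordinates, not part of the patch:

import cmath, math

xj, yj = 0.3, -0.4
rj = math.sqrt( xj**2 + yj**2 )
if rj != 0.:
    cos, sin = xj/rj, yj/rj    # cosine and sine of theta, without atan2
else:
    cos, sin = 1., 0.
exptheta_m1 = cos + 1.j*sin
assert abs( exptheta_m1 - cmath.exp( 1.j*math.atan2(yj, xj) ) ) < 1e-12
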
- exptheta_m1 = cos + 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 - J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 - J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 - # Mode 1 - J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 - J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 - J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 - - # Compute values in local copies and consider boundaries - ir0 = int64(math.floor(r_cell)) - 1 - - if (ir0 == -2): - J_r_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal - J_r_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - - J_t_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 
1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - if (ir0 == -1): - J_r_m0_10 += r_shape(r_cell, 
0)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal - - J_t_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 
3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - if (ir0 >= 0): - J_r_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal 
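
Reviewer note: the J_*_scal factors accumulated above convert each particle's normalized momentum to a velocity via 1/gamma and project it onto the local radial and azimuthal directions. A standalone recap with made-up values, not part of the patch:

import math
from scipy.constants import c

wj, uxj, uyj, uzj = 1.0, 0.1, 0.2, 0.5
inv_gammaj = 1./math.sqrt( 1. + uxj**2 + uyj**2 + uzj**2 )
cos, sin = 0.6, 0.8
J_r_scal = wj * c * inv_gammaj*( cos*uxj + sin*uyj )   # radial current
J_t_scal = wj * c * inv_gammaj*( cos*uyj - sin*uxj )   # azimuthal current
J_z_scal = wj * c * inv_gammaj*uzj                     # longitudinal current
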
- J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal - - J_t_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_01 += r_shape(r_cell, 
0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - - # Index Shifting since local copies are centered around - # the current cell - srl = 0 # shift r lower - sru = 0 # shift r upper inner - sru2 = 0 # shift r upper outer - szl = 0 # shift z lower - szu = 0 # shift z upper inner - szu2 = 0 # shift z upper outer - if (iz_cell-1) < 0: - szl += Nz - if (iz_cell) == (Nz - 1): - szu -= Nz - szu2 -= Nz - if (iz_cell+1) == (Nz - 1): - szu2 -= Nz - if (ir_cell) >= (Nr - 1): - sru = -1 - sru2 = -2 - if (ir_cell+1) == (Nr - 1): - sru2 = -1 - if (ir_cell-1) < 0: - srl = 1 - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m0_00) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell - 1 + srl), J_r_m0_01) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m0_02) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m0_03) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell ), J_r_m0_10) - 
cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_11) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_11.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_11.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell), J_r_m0_12) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell), J_r_m0_13) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m0_20) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 1 + sru), J_r_m0_21) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m0_22) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m0_23) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m0_30) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 2 + sru2), J_r_m0_31) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m0_32) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m0_33) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m0_00) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell - 1 + srl), J_t_m0_01) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m0_02) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + 
srl), J_t_m1_02.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m0_03) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell ), J_t_m0_10) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_11) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_11.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_11.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell), J_t_m0_12) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell), J_t_m0_13) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m0_20) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 1 + sru), J_t_m0_21) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m0_22) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m0_23) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m0_30) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 2 + sru2), J_t_m0_31) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m0_32) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m0_33) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.imag) - - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m0_00) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell - 1 + srl), J_z_m0_01) - 
cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m0_02) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m0_03) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell ), J_z_m0_10) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_11) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_11.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_11.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell), J_z_m0_12) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell), J_z_m0_13) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m0_20) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 1 + sru), J_z_m0_21) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m0_22) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m0_23) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m0_30) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 2 + sru2), J_z_m0_31) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m0_32) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m0_33) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 
2 + sru2), J_z_m1_33.imag)
diff --git a/fbpic/particles/cuda_deposition/linear.py b/fbpic/particles/cuda_deposition/linear.py
deleted file mode 100644
index 48adff845..000000000
--- a/fbpic/particles/cuda_deposition/linear.py
+++ /dev/null
@@ -1,482 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the deposition methods for rho and J for linear order shapes
-"""
-from numba import cuda, int64
-import math
-from scipy.constants import c
-import numpy as np
-
-
-# Shape Factor functions to compute particle shapes.
-@cuda.jit(device=True, inline=True)
-def get_z_shape_linear(cell_position, index):
-    iz = int64(math.floor(cell_position))
-    if index == 0:
-        return iz+1.-cell_position
-    if index == 1:
-        return cell_position - iz
-
-
-@cuda.jit(device=True, inline=True)
-def get_r_shape_linear(cell_position, index):
-    flip_factor = 1.
-    ir = int64(math.floor(cell_position))
-    if index == 0:
-        if ir < 0:
-            flip_factor = -1.
-        return flip_factor*(ir+1.-cell_position)
-    if index == 1:
-        return flip_factor*(cell_position - ir)
-
-
-# -------------------------------
-# Field deposition utility - rho
-# -------------------------------
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-    float64, float64, int32, \
-    float64, float64, int32, \
-    complex128[:,:], complex128[:,:], \
-    int32[:], int32[:])')
-def deposit_rho_gpu_linear(x, y, z, w,
-                           invdz, zmin, Nz,
-                           invdr, rmin, Nr,
-                           rho_m0, rho_m1,
-                           cell_idx, prefix_sum):
-    """
-    Deposition of the charge density rho using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of rho that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 4 variables (one for each possible direction,
-    e.g. upper in z, lower in r) to maintain parallelism while
-    avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho_m0, rho_m1 : 2darrays of complexs
-        The charge density on the interpolation grid for
-        mode 0 and 1. (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the considered direction
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    if i < prefix_sum.shape[0]:
-        # Calculate the cell index in 2D from the 1D threadIdx
-        iz_cell = int(i / Nr)
-        ir_cell = int(i - iz_cell * Nr)
-        # Calculate the inclusive offset for the current cell
-        # It represents the number of particles contained in all other cells
-        # with an index smaller than i + the total number of particles in the
-        # current cell (inclusive).
-        incl_offset = np.int32(prefix_sum[i])
-        # Calculate the frequency per cell from the offset and the previous
-        # offset (prefix_sum[i-1]).
- if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare local field arrays - R_m0_00 = 0. - R_m0_01 = 0. - R_m0_10 = 0. - R_m0_11 = 0. - - R_m1_00 = 0. + 0.j - R_m1_01 = 0. + 0.j - R_m1_10 = 0. + 0.j - R_m1_11 = 0. + 0.j - - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0_scal = wj * exptheta_m0 - # Mode 1 - R_m1_scal = wj * exptheta_m1 - - # Boundary Region Shifts - ir_lower = int64(math.floor(r_cell)) - - R_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * R_m1_scal - - if ir_lower == -1: - R_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m1_scal - else: - R_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m1_scal - - # Cell shifts for the simulation boundaries - shift_r = 0 - shift_z = 0 - if ir_cell+1 > (Nr-1): - shift_r = -1 - if iz_cell+1 > Nz-1: - shift_z -= Nz - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_00.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_00.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_00.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell), R_m0_01.real) - cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell), R_m1_01.real) - cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell), R_m1_01.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell+1 + shift_r), R_m0_10.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell+1 + shift_r), R_m1_10.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell+1 + shift_r), R_m1_10.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m0_11.real) - cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.real) - 
cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.imag) - - -# ------------------------------- -# Field deposition utility - J -# ------------------------------- - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:],\ - int32[:], int32[:])') -def deposit_J_gpu_linear(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - j_r_m0, j_r_m1, - j_t_m0, j_t_m1, - j_z_m0, j_z_m1, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 4 variables (one for each possible direction, - e.g. upper in z, lower in r) to maintain parallelism while - avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs - The current component in each direction (r, t, z) - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i/Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare the local field value for - # all possible deposition directions, - # depending on the shape order and per mode for r,t and z. - - J_r_m0_00 = 0. - J_r_m1_00 = 0. + 0.j - J_t_m0_00 = 0.# + 0.j - J_t_m1_00 = 0. + 0.j - J_z_m0_00 = 0. - J_z_m1_00 = 0. + 0.j - - J_r_m0_01 = 0. - J_r_m1_01 = 0. + 0.j - J_t_m0_01 = 0. - J_t_m1_01 = 0. + 0.j - J_z_m0_01 = 0. - J_z_m1_01 = 0. + 0.j - - J_r_m0_10 = 0. - J_r_m1_10 = 0. + 0.j - J_t_m0_10 = 0. - J_t_m1_10 = 0. + 0.j - J_z_m0_10 = 0. - J_z_m1_10 = 0. + 0.j - - J_r_m0_11 = 0. - J_r_m1_11 = 0. + 0.j - J_t_m0_11 = 0. 
- J_t_m1_11 = 0. + 0.j - J_z_m0_11 = 0. - J_z_m1_11 = 0. + 0.j - - - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 - J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 - J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 - # Mode 1 - J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 - J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 - J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 - - # Deposit on local copies at respective position - ir_lower = int64(math.floor(r_cell)) - - J_r_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - - # Take into account lower r flips - if ir_lower == -1: - J_r_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * 
J_t_m1_scal - J_z_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - else: - J_r_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - - # Cell shifts for the simulation boundaries - shift_r = 0 - shift_z = 0 - if (ir_cell+1) > (Nr-1): - shift_r = -1 - if (iz_cell+1) > Nz-1: - shift_z -= Nz - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_00.real) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_00.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_00.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell), J_r_m0_01.real) - cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell+1 + shift_r), J_r_m0_10.real) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m0_11.real) - cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_00.real) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_00.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_00.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell), J_t_m0_01.real) - cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell+1 + shift_r), J_t_m0_10.real) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m0_11.real) - cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + 
shift_z, ir_cell+1 + shift_r), J_t_m1_11.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_00.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_00.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_00.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell), J_z_m0_01.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell+1 + shift_r), J_z_m0_10.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m0_11.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.imag)
diff --git a/fbpic/particles/cuda_deposition/linear_non_atomic.py b/fbpic/particles/cuda_deposition/linear_non_atomic.py
deleted file mode 100644
index 6dea16d21..000000000
--- a/fbpic/particles/cuda_deposition/linear_non_atomic.py
+++ /dev/null
@@ -1,671 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the deposition methods for rho and J for linear order shapes
-without using atomic operations.
-"""
-from numba import cuda
-import math
-from scipy.constants import c
-import numpy as np
-
-# -------------------------------
-# Field deposition utility - rho
-# -------------------------------
-
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-    float64, float64, int32, \
-    float64, float64, int32, \
-    complex128[:,:,:], complex128[:,:,:], \
-    complex128[:,:,:], complex128[:,:,:],\
-    int32[:], int32[:])')
-def deposit_rho_gpu(x, y, z, w,
-                    invdz, zmin, Nz,
-                    invdr, rmin, Nr,
-                    rho0, rho1,
-                    rho2, rho3,
-                    cell_idx, prefix_sum):
-    """
-    Deposition of the charge density rho using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of rho that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 4 arrays (one for each possible direction,
-    e.g. upper in z, lower in r) to maintain parallelism while
-    avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho0, rho1, rho2, rho3 : 3darray of complexs
-        2d field arrays, one for each of the deposition directions
-        The third dimension contains the two possible modes.
- (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz = int(i / Nr) - ir = int(i - iz * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - # Initialize the local field value for - # all four possible deposition directions - # Mode 0, 1 for r, t, z - # 1 : lower in r, lower in z - # 2 : lower in r, upper in z - # 3 : upper in r, lower in z - # 4 : upper in r, upper in z - R1_m0 = 0. + 0.j - R2_m0 = 0. + 0.j - R3_m0 = 0. + 0.j - R4_m0 = 0. + 0.j - # ------------ - R1_m1 = 0. + 0.j - R2_m1 = 0. + 0.j - R3_m1 = 0. + 0.j - R4_m1 = 0. + 0.j - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset - 1 - j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1. / rj - cos = xj * invr # Cosine - sin = yj * invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j * sin - - # Get linear weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr * (rj - rmin) - 0.5 - z_cell = invdz * (zj - zmin) - 0.5 - # Original index of the uppper and lower cell - ir_lower = int(math.floor(r_cell)) - ir_upper = ir_lower + 1 - iz_lower = int(math.floor(z_cell)) - iz_upper = iz_lower + 1 - # Linear weight - Sr_lower = ir_upper - r_cell - Sr_upper = r_cell - ir_lower - Sz_lower = iz_upper - z_cell - Sz_upper = z_cell - iz_lower - # Set guard weights to zero - Sr_guard = 0. - - # Treat the boundary conditions - # -------------------------------------------- - # guard cells in lower r - if ir_lower < 0: - Sr_guard = Sr_lower - Sr_lower = 0. 
- ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr - 1: - ir_lower = Nr - 1 - if ir_upper > Nr - 1: - ir_upper = Nr - 1 - # periodic boundaries in z - # lower z boundaries - if iz_lower < 0: - iz_lower += Nz - if iz_upper < 0: - iz_upper += Nz - # upper z boundaries - if iz_lower > Nz - 1: - iz_lower -= Nz - if iz_upper > Nz - 1: - iz_upper -= Nz - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0 = wj * exptheta_m0 - # Mode 1 - R_m1 = wj * exptheta_m1 - - # Caculate the weighted currents for each - # of the four possible direction - # -------------------------------------------- - if ir_lower == ir_upper: - # In the case that ir_lower and ir_upper are equal, - # the current is added only to the array corresponding - # to ir_lower. - # (This is the case for the boundaries in r) - R1_m0 += Sz_lower * Sr_lower * R_m0 - R1_m0 += Sz_lower * Sr_upper * R_m0 - R3_m0 += Sz_upper * Sr_lower * R_m0 - R3_m0 += Sz_upper * Sr_upper * R_m0 - # ----------------------------- - R1_m1 += Sz_lower * Sr_lower * R_m1 - R1_m1 += Sz_lower * Sr_upper * R_m1 - R3_m1 += Sz_upper * Sr_lower * R_m1 - R3_m1 += Sz_upper * Sr_upper * R_m1 - # ----------------------------- - if ir_lower != ir_upper: - # In the case that ir_lower and ir_upper are different, - # add the current to the four arrays according to - # the direction. - R1_m0 += Sz_lower * Sr_lower * R_m0 - R2_m0 += Sz_lower * Sr_upper * R_m0 - R3_m0 += Sz_upper * Sr_lower * R_m0 - R4_m0 += Sz_upper * Sr_upper * R_m0 - # ----------------------------- - R1_m1 += Sz_lower * Sr_lower * R_m1 - R2_m1 += Sz_lower * Sr_upper * R_m1 - R3_m1 += Sz_upper * Sr_lower * R_m1 - R4_m1 += Sz_upper * Sr_upper * R_m1 - # ----------------------------- - if ir_lower == ir_upper == 0: - # Treat the guard cells. - # Add the current to the guard cells - # for particles that had an original - # cell index < 0. - R1_m0 += -1. * Sz_lower * Sr_guard * R_m0 - R3_m0 += -1. * Sz_upper * Sr_guard * R_m0 - # --------------------------------- - R1_m1 += -1. * Sz_lower * Sr_guard * R_m1 - R3_m1 += -1. * Sz_upper * Sr_guard * R_m1 - # Write the calculated field values to - # the field arrays defined on the interpolation grid - rho0[iz, ir, 0] = R1_m0 - rho0[iz, ir, 1] = R1_m1 - rho1[iz, ir, 0] = R2_m0 - rho1[iz, ir, 1] = R2_m1 - rho2[iz, ir, 0] = R3_m0 - rho2[iz, ir, 1] = R3_m1 - rho3[iz, ir, 0] = R4_m0 - rho3[iz, ir, 1] = R4_m1 - - -@cuda.jit('void(complex128[:,:], complex128[:,:], \ - complex128[:,:,:], complex128[:,:,:], \ - complex128[:,:,:], complex128[:,:,:])') -def add_rho(rho_m0, rho_m1, - rho0, rho1, - rho2, rho3): - """ - Merges the 4 separate field arrays that contain rho for - each deposition direction and adds them to the global - interpolation grid arrays for mode 0 and 1. - - Parameters - ---------- - rho_m0, rho_m1 : 2darrays of complexs - The charge density on the interpolation grid for - mode 0 and 1. (is modified by this function) - - rho0, rho1, rho2, rho3 : 3darrays of complexs - 2d field arrays, one for each of the deposition directions - The third dimension contains the two possible modes. 
- """ - # Get the CUDA Grid in 2D - i, j = cuda.grid(2) - # Only for threads within (nz, nr) - if (i < rho_m0.shape[0] and j < rho_m0.shape[1]): - # Sum the four field arrays for the different deposition - # directions and write them to the global field array - rho_m0[i, j] += rho0[i, j, 0] + \ - rho1[i, j - 1, 0] + \ - rho2[i - 1, j, 0] + \ - rho3[i - 1, j - 1, 0] - - rho_m1[i, j] += rho0[i, j, 1] + \ - rho1[i, j - 1, 1] + \ - rho2[i - 1, j, 1] + \ - rho3[i - 1, j - 1, 1] - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:,:], complex128[:,:,:], \ - complex128[:,:,:], complex128[:,:,:],\ - int32[:], int32[:])') -def deposit_J_gpu(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - J0, J1, - J2, J3, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 4 arrays (one for each possible direction, - e.g. upper in z, lower in r) to maintain parallelism while - avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - J0, J1, J2, J3 : 3darray of complexs - 2d field arrays, one for each of the deposition directions - The third dimension contains the two possible modes and the - 3 directions of J in cylindrical coordinates (r, t, z). - (is mofidied by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz = int(i / Nr) - ir = int(i - iz * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - # Initialize the local field value for - # all four possible deposition directions - # Mode 0, 1 for r, t, z - # 1 : lower in r, lower in z - # 2 : lower in r, upper in z - # 3 : upper in r, lower in z - # 4 : upper in r, upper in z - Jr1_m0 = 0. + 0.j - Jr2_m0 = 0. + 0.j - Jr3_m0 = 0. + 0.j - Jr4_m0 = 0. 
+ 0.j - # ------------- - Jr1_m1 = 0. + 0.j - Jr2_m1 = 0. + 0.j - Jr3_m1 = 0. + 0.j - Jr4_m1 = 0. + 0.j - # ------------- - Jt1_m0 = 0. + 0.j - Jt2_m0 = 0. + 0.j - Jt3_m0 = 0. + 0.j - Jt4_m0 = 0. + 0.j - # ------------- - Jt1_m1 = 0. + 0.j - Jt2_m1 = 0. + 0.j - Jt3_m1 = 0. + 0.j - Jt4_m1 = 0. + 0.j - # ------------- - Jz1_m0 = 0. + 0.j - Jz2_m0 = 0. + 0.j - Jz3_m0 = 0. + 0.j - Jz4_m0 = 0. + 0.j - # ------------- - Jz1_m1 = 0. + 0.j - Jz2_m1 = 0. + 0.j - Jz3_m1 = 0. + 0.j - Jz4_m1 = 0. + 0.j - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset - 1 - j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1. / rj - cos = xj * invr # Cosine - sin = yj * invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j * sin - - # Get linear weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr * (rj - rmin) - 0.5 - z_cell = invdz * (zj - zmin) - 0.5 - # Original index of the uppper and lower cell - # in r and z - ir_lower = int(math.floor(r_cell)) - ir_upper = ir_lower + 1 - iz_lower = int(math.floor(z_cell)) - iz_upper = iz_lower + 1 - # Linear weight - Sr_lower = ir_upper - r_cell - Sr_upper = r_cell - ir_lower - Sz_lower = iz_upper - z_cell - Sz_upper = z_cell - iz_lower - # Set guard weights to zero - Sr_guard = 0. - - # Treat the boundary conditions - # -------------------------------------------- - # guard cells in lower r - if ir_lower < 0: - Sr_guard = Sr_lower - Sr_lower = 0. - ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr - 1: - ir_lower = Nr - 1 - if ir_upper > Nr - 1: - ir_upper = Nr - 1 - # periodic boundaries in z - # lower z boundaries - if iz_lower < 0: - iz_lower += Nz - if iz_upper < 0: - iz_upper += Nz - # upper z boundaries - if iz_lower > Nz - 1: - iz_lower -= Nz - if iz_upper > Nz - 1: - iz_upper -= Nz - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - Jr_m0 = wj * c * inv_gammaj * (cos * uxj + sin * uyj) * exptheta_m0 - Jt_m0 = wj * c * inv_gammaj * (cos * uyj - sin * uxj) * exptheta_m0 - Jz_m0 = wj * c * inv_gammaj * uzj * exptheta_m0 - # Mode 1 - Jr_m1 = wj * c * inv_gammaj * (cos * uxj + sin * uyj) * exptheta_m1 - Jt_m1 = wj * c * inv_gammaj * (cos * uyj - sin * uxj) * exptheta_m1 - Jz_m1 = wj * c * inv_gammaj * uzj * exptheta_m1 - - # Caculate the weighted currents for each - # of the four possible direction - # -------------------------------------------- - if ir_lower == ir_upper: - # In the case that ir_lower and ir_upper are equal, - # the current is added only to the array corresponding - # to ir_lower. 
- # (This is the case for the boundaries in r) - Jr1_m0 += Sz_lower * Sr_lower * Jr_m0 - Jr1_m0 += Sz_lower * Sr_upper * Jr_m0 - Jr3_m0 += Sz_upper * Sr_lower * Jr_m0 - Jr3_m0 += Sz_upper * Sr_upper * Jr_m0 - # ------------------------------- - Jr1_m1 += Sz_lower * Sr_lower * Jr_m1 - Jr1_m1 += Sz_lower * Sr_upper * Jr_m1 - Jr3_m1 += Sz_upper * Sr_lower * Jr_m1 - Jr3_m1 += Sz_upper * Sr_upper * Jr_m1 - # ------------------------------- - Jt1_m0 += Sz_lower * Sr_lower * Jt_m0 - Jt1_m0 += Sz_lower * Sr_upper * Jt_m0 - Jt3_m0 += Sz_upper * Sr_lower * Jt_m0 - Jt3_m0 += Sz_upper * Sr_upper * Jt_m0 - # ------------------------------- - Jt1_m1 += Sz_lower * Sr_lower * Jt_m1 - Jt1_m1 += Sz_lower * Sr_upper * Jt_m1 - Jt3_m1 += Sz_upper * Sr_lower * Jt_m1 - Jt3_m1 += Sz_upper * Sr_upper * Jt_m1 - # ------------------------------- - Jz1_m0 += Sz_lower * Sr_lower * Jz_m0 - Jz1_m0 += Sz_lower * Sr_upper * Jz_m0 - Jz3_m0 += Sz_upper * Sr_lower * Jz_m0 - Jz3_m0 += Sz_upper * Sr_upper * Jz_m0 - # ------------------------------- - Jz1_m1 += Sz_lower * Sr_lower * Jz_m1 - Jz1_m1 += Sz_lower * Sr_upper * Jz_m1 - Jz3_m1 += Sz_upper * Sr_lower * Jz_m1 - Jz3_m1 += Sz_upper * Sr_upper * Jz_m1 - # ------------------------------- - if ir_lower != ir_upper: - # In the case that ir_lower and ir_upper are different, - # add the current to the four arrays according to - # the direction. - Jr1_m0 += Sz_lower * Sr_lower * Jr_m0 - Jr2_m0 += Sz_lower * Sr_upper * Jr_m0 - Jr3_m0 += Sz_upper * Sr_lower * Jr_m0 - Jr4_m0 += Sz_upper * Sr_upper * Jr_m0 - # ------------------------------- - Jr1_m1 += Sz_lower * Sr_lower * Jr_m1 - Jr2_m1 += Sz_lower * Sr_upper * Jr_m1 - Jr3_m1 += Sz_upper * Sr_lower * Jr_m1 - Jr4_m1 += Sz_upper * Sr_upper * Jr_m1 - # ------------------------------- - Jt1_m0 += Sz_lower * Sr_lower * Jt_m0 - Jt2_m0 += Sz_lower * Sr_upper * Jt_m0 - Jt3_m0 += Sz_upper * Sr_lower * Jt_m0 - Jt4_m0 += Sz_upper * Sr_upper * Jt_m0 - # ------------------------------- - Jt1_m1 += Sz_lower * Sr_lower * Jt_m1 - Jt2_m1 += Sz_lower * Sr_upper * Jt_m1 - Jt3_m1 += Sz_upper * Sr_lower * Jt_m1 - Jt4_m1 += Sz_upper * Sr_upper * Jt_m1 - # ------------------------------- - Jz1_m0 += Sz_lower * Sr_lower * Jz_m0 - Jz2_m0 += Sz_lower * Sr_upper * Jz_m0 - Jz3_m0 += Sz_upper * Sr_lower * Jz_m0 - Jz4_m0 += Sz_upper * Sr_upper * Jz_m0 - # ------------------------------- - Jz1_m1 += Sz_lower * Sr_lower * Jz_m1 - Jz2_m1 += Sz_lower * Sr_upper * Jz_m1 - Jz3_m1 += Sz_upper * Sr_lower * Jz_m1 - Jz4_m1 += Sz_upper * Sr_upper * Jz_m1 - # ------------------------------- - if ir_lower == ir_upper == 0: - # Treat the guard cells. - # Add the current to the guard cells - # for particles that had an original - # cell index < 0. - Jr1_m0 += -1. * Sz_lower * Sr_guard * Jr_m0 - Jr3_m0 += -1. * Sz_upper * Sr_guard * Jr_m0 - # ----------------------------------- - Jr1_m1 += -1. * Sz_lower * Sr_guard * Jr_m1 - Jr3_m1 += -1. * Sz_upper * Sr_guard * Jr_m1 - # ----------------------------------- - Jt1_m0 += -1. * Sz_lower * Sr_guard * Jt_m0 - Jt3_m0 += -1. * Sz_upper * Sr_guard * Jt_m0 - # ----------------------------------- - Jt1_m1 += -1. * Sz_lower * Sr_guard * Jt_m1 - Jt3_m1 += -1. * Sz_upper * Sr_guard * Jt_m1 - # ----------------------------------- - Jz1_m0 += -1. * Sz_lower * Sr_guard * Jz_m0 - Jz3_m0 += -1. * Sz_upper * Sr_guard * Jz_m0 - # ----------------------------------- - Jz1_m1 += -1. * Sz_lower * Sr_guard * Jz_m1 - Jz3_m1 += -1. 
* Sz_upper * Sr_guard * Jz_m1
-        # Write the calculated field values to
-        # the field arrays defined on the interpolation grid
-        J0[iz, ir, 0] = Jr1_m0
-        J0[iz, ir, 1] = Jr1_m1
-        J0[iz, ir, 2] = Jt1_m0
-        J0[iz, ir, 3] = Jt1_m1
-        J0[iz, ir, 4] = Jz1_m0
-        J0[iz, ir, 5] = Jz1_m1
-        # --------------------
-        J1[iz, ir, 0] = Jr2_m0
-        J1[iz, ir, 1] = Jr2_m1
-        J1[iz, ir, 2] = Jt2_m0
-        J1[iz, ir, 3] = Jt2_m1
-        J1[iz, ir, 4] = Jz2_m0
-        J1[iz, ir, 5] = Jz2_m1
-        # --------------------
-        J2[iz, ir, 0] = Jr3_m0
-        J2[iz, ir, 1] = Jr3_m1
-        J2[iz, ir, 2] = Jt3_m0
-        J2[iz, ir, 3] = Jt3_m1
-        J2[iz, ir, 4] = Jz3_m0
-        J2[iz, ir, 5] = Jz3_m1
-        # --------------------
-        J3[iz, ir, 0] = Jr4_m0
-        J3[iz, ir, 1] = Jr4_m1
-        J3[iz, ir, 2] = Jt4_m0
-        J3[iz, ir, 3] = Jt4_m1
-        J3[iz, ir, 4] = Jz4_m0
-        J3[iz, ir, 5] = Jz4_m1
-
-
-@cuda.jit('void(complex128[:,:], complex128[:,:], \
-    complex128[:,:], complex128[:,:], \
-    complex128[:,:], complex128[:,:], \
-    complex128[:,:,:], complex128[:,:,:], \
-    complex128[:,:,:], complex128[:,:,:])')
-def add_J(Jr_m0, Jr_m1,
-          Jt_m0, Jt_m1,
-          Jz_m0, Jz_m1,
-          J0, J1,
-          J2, J3):
-    """
-    Merges the 4 separate field arrays that contain J for
-    each deposition direction and adds them to the global
-    interpolation grid arrays for mode 0 and 1.
-
-    Parameters
-    ----------
-    Jr_m0, Jr_m1, Jt_m0, Jt_m1, Jz_m0, Jz_m1,: 2darrays of complexs
-        The current component in each direction (r, t, z)
-        on the interpolation grid for mode 0 and 1.
-        (is modified by this function)
-
-    J0, J1, J2, J3 : 3darrays of complexs
-        2d field arrays, one for each of the deposition directions
-        The third dimension contains the two possible modes and
-        the 3 different components of J (r, t, z).
-    """
-    # Get the CUDA Grid in 2D
-    i, j = cuda.grid(2)
-    # Only for threads within (nz, nr)
-    if (i < Jr_m0.shape[0] and j < Jr_m0.shape[1]):
-        # Sum the four field arrays for the different deposition
-        # directions and write them to the global field array
-        Jr_m0[i, j] += J0[i, j, 0] + \
-            J1[i, j - 1, 0] + \
-            J2[i - 1, j, 0] + \
-            J3[i - 1, j - 1, 0]
-
-        Jr_m1[i, j] += J0[i, j, 1] + \
-            J1[i, j - 1, 1] + \
-            J2[i - 1, j, 1] + \
-            J3[i - 1, j - 1, 1]
-
-        Jt_m0[i, j] += J0[i, j, 2] + \
-            J1[i, j - 1, 2] + \
-            J2[i - 1, j, 2] + \
-            J3[i - 1, j - 1, 2]
-
-        Jt_m1[i, j] += J0[i, j, 3] + \
-            J1[i, j - 1, 3] + \
-            J2[i - 1, j, 3] + \
-            J3[i - 1, j - 1, 3]
-
-        Jz_m0[i, j] += J0[i, j, 4] + \
-            J1[i, j - 1, 4] + \
-            J2[i - 1, j, 4] + \
-            J3[i - 1, j - 1, 4]
-
-        Jz_m1[i, j] += J0[i, j, 5] + \
-            J1[i, j - 1, 5] + \
-            J2[i - 1, j, 5] + \
-            J3[i - 1, j - 1, 5]
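Both the deleted kernels above and their replacements below recover the number of particles per cell from an inclusive prefix sum over the cell-sorted particle arrays. A minimal CPU-side sketch of that indexing convention, in plain NumPy (illustrative only; none of these names are part of the patch):

    import numpy as np

    # Hypothetical example: particles are sorted by cell index, and
    # prefix_sum[i] is the inclusive cumulative particle count of
    # cells 0..i, as produced by FBPIC's sorting step.
    counts = np.array([3, 0, 2, 1])      # particles per cell (example data)
    prefix_sum = np.cumsum(counts)       # inclusive: [3, 3, 5, 6]

    for i in range(len(prefix_sum)):
        incl_offset = int(prefix_sum[i])
        # Count in cell i: difference with the previous inclusive offset
        n_in_cell = incl_offset - int(prefix_sum[i-1]) if i > 0 else incl_offset
        # Sorted indices of the particles in cell i (the -1 accounts for
        # 0-based indexing, exactly as in the kernels above and below)
        indices = [incl_offset - 1 - j for j in range(n_in_cell)]

This is why the kernels compute `frequency_per_cell` from `prefix_sum[i] - prefix_sum[i-1]` (with the `i == 0` special case) and walk particles backwards from `incl_offset - 1`.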
+""" +from numba import cuda, int64 +import math +from scipy.constants import c +import numpy as np + +# ------------------------------- +# Particle shape Factor functions +# ------------------------------- + +# Linear shapes +@cuda.jit(device=True, inline=True) +def z_shape_linear(cell_position, index): + iz = int64(math.floor(cell_position)) + if index == 0: + return iz+1.-cell_position + if index == 1: + return cell_position - iz + +@cuda.jit(device=True, inline=True) +def r_shape_linear(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(ir+1.-cell_position) + if index == 1: + return flip_factor*(cell_position - ir) + +# Cubic shapes +@cuda.jit(device=True, inline=True) +def z_shape_cubic(cell_position, index): + iz = int64(math.floor(cell_position)) - 1 + if index == 0: + return (-1./6.)*((cell_position-iz)-2)**3 + if index == 1: + return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) + if index == 2: + return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) + if index == 3: + return (-1./6.)*(((iz+3)-cell_position)-2)**3 + +@cuda.jit(device=True, inline=True) +def r_shape_cubic(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) - 1 + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 + if index == 1: + if ir+1 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) + if index == 2: + if ir+2 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) + if index == 3: + if ir+3 < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 + +# ------------------------------- +# Field deposition - linear - rho +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + int32[:], int32[:])') +def deposit_rho_gpu_linear(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0, rho_m1, + cell_idx, prefix_sum): + """ + Deposition of the charge density rho using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of rho that is deposited to the + 4 cells surounding the particle based on its shape (linear). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 4 variables (one for each possible direction, + e.g. upper in z, lower in r) to maintain parallelism while + avoiding any race conditions. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0, rho_m1 : 2darrays of complexs + The charge density on the interpolation grid for + mode 0 and 1. 
(is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i / Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare local field arrays + R_m0_00 = 0. + R_m0_01 = 0. + R_m0_10 = 0. + R_m0_11 = 0. + + R_m1_00 = 0. + 0.j + R_m1_01 = 0. + 0.j + R_m1_10 = 0. + 0.j + R_m1_11 = 0. + 0.j + + for j in range(frequency_per_cell): + # Get the particle index before the sorting + # -------------------------------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. 
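+            # Azimuthal phase factors exp(i*m*theta): mode 0 reduces to 1,
+            # mode 1 is cos(theta) + i*sin(theta)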
+ exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + + # Boundary Region Shifts + ir_lower = int64(math.floor(r_cell)) + + R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal + + if ir_lower == -1: + R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + else: + R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if ir_cell+1 > (Nr-1): + shift_r = -1 + if iz_cell+1 > Nz-1: + shift_z -= Nz + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_00.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_00.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_00.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell), R_m0_01.real) + cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell), R_m1_01.real) + cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell), R_m1_01.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell+1 + shift_r), R_m0_10.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell+1 + shift_r), R_m1_10.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell+1 + shift_r), R_m1_10.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m0_11.real) + cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.real) + cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.imag) + + +# ------------------------------- +# Field deposition - linear - J +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:],\ + int32[:], int32[:])') +def deposit_J_gpu_linear(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0, j_r_m1, + j_t_m0, j_t_m1, + j_z_m0, j_z_m1, + cell_idx, prefix_sum): + """ + Deposition of the current J using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of J that is deposited to the + 4 cells surounding the particle based on its shape (linear). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 4 variables (one for each possible direction, + e.g. 
upper in z, lower in r) to maintain parallelism while
+    avoiding any race conditions.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (dimensionless)
+        The normalized momenta gamma*v/c of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1 : 2darrays of complexs
+        The current component in each direction (r, t, z)
+        on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    # Get the 1D CUDA grid
+    i = cuda.grid(1)
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    if i < prefix_sum.shape[0]:
+        # Calculate the cell index in 2D from the 1D threadIdx
+        iz_cell = int(i/Nr)
+        ir_cell = int(i - iz_cell * Nr)
+        # Calculate the inclusive offset for the current cell
+        # It represents the number of particles contained in all other cells
+        # with an index smaller than i + the total number of particles in the
+        # current cell (inclusive).
+        incl_offset = np.int32(prefix_sum[i])
+        # Calculate the frequency per cell from the offset and the previous
+        # offset (prefix_sum[i-1]).
+        if i > 0:
+            frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1])
+        if i == 0:
+            frequency_per_cell = np.int32(incl_offset)
+
+        # Declare the local field value for
+        # all possible deposition directions,
+        # depending on the shape order and per mode for r,t and z.
+
+        J_r_m0_00 = 0.
+        J_r_m1_00 = 0. + 0.j
+        J_t_m0_00 = 0.
+        J_t_m1_00 = 0. + 0.j
+        J_z_m0_00 = 0.
+        J_z_m1_00 = 0. + 0.j
+
+        J_r_m0_01 = 0.
+        J_r_m1_01 = 0. + 0.j
+        J_t_m0_01 = 0.
+        J_t_m1_01 = 0. + 0.j
+        J_z_m0_01 = 0.
+        J_z_m1_01 = 0. + 0.j
+
+        J_r_m0_10 = 0.
+        J_r_m1_10 = 0. + 0.j
+        J_t_m0_10 = 0.
+        J_t_m1_10 = 0. + 0.j
+        J_z_m0_10 = 0.
+        J_z_m1_10 = 0. + 0.j
+
+        J_r_m0_11 = 0.
+        J_r_m1_11 = 0. + 0.j
+        J_t_m0_11 = 0.
+        J_t_m1_11 = 0. + 0.j
+        J_z_m0_11 = 0.
+        J_z_m1_11 = 0. + 0.j
+
+
+        # Loop over the number of particles per cell
+        for j in range(frequency_per_cell):
+            # Get the particle index
+            # ----------------------
+            # (Since incl_offset is a cumulative sum of particle number,
+            # and since python index starts at 0, one has to add -1)
+            ptcl_idx = incl_offset-1-j
+
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Momenta
+            uxj = ux[ptcl_idx]
+            uyj = uy[ptcl_idx]
+            uzj = uz[ptcl_idx]
+            # Inverse gamma
+            inv_gammaj = inv_gamma[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr # Cosine
+                sin = yj*invr # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
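+            # Azimuthal phase factors exp(i*m*theta), as in the rho kernel:
+            # the mode-1 currents below carry the factor cos(theta) + i*sin(theta)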
+ exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Deposit on local copies at respective position + ir_lower = int64(math.floor(r_cell)) + + J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Take into account lower r flips + if ir_lower == -1: + J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + else: + J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * 
J_z_m1_scal + J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if (ir_cell+1) > (Nr-1): + shift_r = -1 + if (iz_cell+1) > Nz-1: + shift_z -= Nz + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_00.real) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_00.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_00.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell), J_r_m0_01.real) + cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell+1 + shift_r), J_r_m0_10.real) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m0_11.real) + cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_00.real) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_00.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_00.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell), J_t_m0_01.real) + cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell+1 + shift_r), J_t_m0_10.real) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m0_11.real) + cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_00.real) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_00.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_00.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell), J_z_m0_01.real) + cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell+1 + shift_r), J_z_m0_10.real) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m0_11.real) + cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.imag) + +# ------------------------------- +# Field deposition - cubic - rho +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64, 
float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + int32[:], int32[:])') +def deposit_rho_gpu_cubic(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0, rho_m1, + cell_idx, prefix_sum): + """ + Deposition of the charge density rho using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of rho that is deposited to the + 16 cells surounding the particle based on its shape (cubic). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 16 variables (one for each surrounding cell) to + maintain parallelism while avoiding any race conditions. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0, rho_m1 : 2darrays of complexs + The charge density on the interpolation grid for + mode 0 and 1. (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i / Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare local field arrays + R_m0_00 = 0. + R_m1_00 = 0. + 0.j + + R_m0_01 = 0. + R_m1_01 = 0. + 0.j + + R_m0_02 = 0. + R_m1_02 = 0. + 0.j + + R_m0_03 = 0. + R_m1_03 = 0. + 0.j + + R_m0_10 = 0. + R_m1_10 = 0. + 0.j + + R_m0_11 = 0. + R_m1_11 = 0. + 0.j + + R_m0_12 = 0. + R_m1_12 = 0. + 0.j + + R_m0_13 = 0. + R_m1_13 = 0. + 0.j + + R_m0_20 = 0. + R_m1_20 = 0. + 0.j + + R_m0_21 = 0. + R_m1_21 = 0. + 0.j + + R_m0_22 = 0. + R_m1_22 = 0. + 0.j + + R_m0_23 = 0. + R_m1_23 = 0. + 0.j + + R_m0_30 = 0. + R_m1_30 = 0. + 0.j + + R_m0_31 = 0. + R_m1_31 = 0. + 0.j + + R_m0_32 = 0. + R_m1_32 = 0. + 0.j + + R_m0_33 = 0. + R_m1_33 = 0. + 0.j + + for j in range(frequency_per_cell): + # Get the particle index before the sorting + # -------------------------------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. 
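+            # (for a particle exactly on the axis, theta is set to 0,
+            # i.e. cos = 1 and sin = 0)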
+ if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + # Compute values in local copies and consider boundaries + ir0 = int64(math.floor(r_cell)) - 1 + + if (ir0 == -2): + R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + if (ir0 == -1): + R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 
1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + if (ir0 >= 0): + R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m0_00.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell - 1 + srl), R_m0_01.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell - 1 + srl), R_m1_01.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell - 1 + srl), R_m1_01.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m0_02.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m0_03.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell ), R_m0_10.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell ), R_m1_10.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell ), R_m1_10.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_11.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_11.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_11.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell), R_m0_12.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell), R_m1_12.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell), R_m1_12.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell), R_m0_13.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell), R_m1_13.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell), R_m1_13.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m0_20.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 1 + sru), R_m0_21.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 1 + sru), R_m1_21.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 1 + sru), R_m1_21.imag) + + cuda.atomic.add(rho_m0.real, 
(iz_cell + 1 + szu, ir_cell + 1 + sru), R_m0_22.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m0_23.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m0_30.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 2 + sru2), R_m0_31.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 2 + sru2), R_m1_31.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 2 + sru2), R_m1_31.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m0_32.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m0_33.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.imag)
+
+
+# -------------------------------
+# Field deposition - cubic - J
+# -------------------------------
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+           float64[:], float64[:], float64[:], float64[:], \
+           float64, float64, int32, \
+           float64, float64, int32, \
+           complex128[:,:], complex128[:,:], \
+           complex128[:,:], complex128[:,:], \
+           complex128[:,:], complex128[:,:],\
+           int32[:], int32[:])')
+def deposit_J_gpu_cubic(x, y, z, w,
+                        ux, uy, uz, inv_gamma,
+                        invdz, zmin, Nz,
+                        invdr, rmin, Nr,
+                        j_r_m0, j_r_m1,
+                        j_t_m0, j_t_m1,
+                        j_z_m0, j_z_m1,
+                        cell_idx, prefix_sum):
+    """
+    Deposition of the current J using numba on the GPU.
+    Iterates over the cells and over the particles per cell.
+    Calculates the weighted amount of J that is deposited to the
+    16 cells surrounding the particle based on its shape (cubic).
+
+    The particles are sorted by their cell index (the lower cell
+    in r and z that they deposit to) and the deposited field
+    is split into 16 variables (one for each surrounding cell) to
+    maintain parallelism while avoiding any race conditions.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (dimensionless)
+        The normalized momenta gamma*v/c of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1 : 2darrays of complexs
+        The current component in each direction (r, t, z)
+        on the interpolation grid for mode 0 and 1.
+ (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i/Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode for r,t and z. + J_r_m0_00 = 0. + J_t_m0_00 = 0. + J_z_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m1_00 = 0. + 0.j + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_t_m0_01 = 0. + J_z_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m1_01 = 0. + 0.j + J_z_m1_01 = 0. + 0.j + + J_r_m0_02 = 0. + J_t_m0_02 = 0. + J_z_m0_02 = 0. + J_r_m1_02 = 0. + 0.j + J_t_m1_02 = 0. + 0.j + J_z_m1_02 = 0. + 0.j + + J_r_m0_03 = 0. + J_t_m0_03 = 0. + J_z_m0_03 = 0. + J_r_m1_03 = 0. + 0.j + J_t_m1_03 = 0. + 0.j + J_z_m1_03 = 0. + 0.j + + J_r_m0_10 = 0. + J_t_m0_10 = 0. + J_z_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m1_10 = 0. + 0.j + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_t_m0_11 = 0. + J_z_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m1_11 = 0. + 0.j + J_z_m1_11 = 0. + 0.j + + J_r_m0_12 = 0. + J_t_m0_12 = 0. + J_z_m0_12 = 0. + J_r_m1_12 = 0. + 0.j + J_t_m1_12 = 0. + 0.j + J_z_m1_12 = 0. + 0.j + + J_r_m0_13 = 0. + J_t_m0_13 = 0. + J_z_m0_13 = 0. + J_r_m1_13 = 0. + 0.j + J_t_m1_13 = 0. + 0.j + J_z_m1_13 = 0. + 0.j + + J_r_m0_20 = 0. + J_t_m0_20 = 0. + J_z_m0_20 = 0. + J_r_m1_20 = 0. + 0.j + J_t_m1_20 = 0. + 0.j + J_z_m1_20 = 0. + 0.j + + J_r_m0_21 = 0. + J_t_m0_21 = 0. + J_z_m0_21 = 0. + J_r_m1_21 = 0. + 0.j + J_t_m1_21 = 0. + 0.j + J_z_m1_21 = 0. + 0.j + + J_r_m0_22 = 0. + J_t_m0_22 = 0. + J_z_m0_22 = 0. + J_r_m1_22 = 0. + 0.j + J_t_m1_22 = 0. + 0.j + J_z_m1_22 = 0. + 0.j + + J_r_m0_23 = 0. + J_t_m0_23 = 0. + J_z_m0_23 = 0. + J_r_m1_23 = 0. + 0.j + J_t_m1_23 = 0. + 0.j + J_z_m1_23 = 0. + 0.j + + J_r_m0_30 = 0. + J_t_m0_30 = 0. + J_z_m0_30 = 0. + J_r_m1_30 = 0. + 0.j + J_t_m1_30 = 0. + 0.j + J_z_m1_30 = 0. + 0.j + + J_r_m0_31 = 0. + J_t_m0_31 = 0. + J_z_m0_31 = 0. + J_r_m1_31 = 0. + 0.j + J_t_m1_31 = 0. + 0.j + J_z_m1_31 = 0. + 0.j + + J_r_m0_32 = 0. + J_t_m0_32 = 0. + J_z_m0_32 = 0. + J_r_m1_32 = 0. + 0.j + J_t_m1_32 = 0. + 0.j + J_z_m1_32 = 0. + 0.j + + J_r_m0_33 = 0. + J_t_m0_33 = 0. + J_z_m0_33 = 0. + J_r_m1_33 = 0. + 0.j + J_t_m1_33 = 0. + 0.j + J_z_m1_33 = 0. 
+ 0.j + + # Loop over the number of particles per cell + for j in range(frequency_per_cell): + # Get the particle index + # ---------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Velocity + uxj = ux[ptcl_idx] + uyj = uy[ptcl_idx] + uzj = uz[ptcl_idx] + # Inverse gamma + inv_gammaj = inv_gamma[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Compute values in local copies and consider boundaries + ir0 = int64(math.floor(r_cell)) - 1 + + if (ir0 == -2): + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + 
J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+
J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir0 == -1): + J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 
1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir0 >= 0): + J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 
2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m0_00) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell - 1 + srl), J_r_m0_01) + cuda.atomic.add(j_r_m1.real, 
(iz_cell, ir_cell - 1 + srl), J_r_m1_01.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m0_02) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m0_03) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell ), J_r_m0_10) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_11) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_11.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_11.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell), J_r_m0_12) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell), J_r_m0_13) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m0_20) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 1 + sru), J_r_m0_21) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m0_22) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m0_23) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m0_30) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 2 + sru2), J_r_m0_31) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m0_32) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m0_33) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.imag) + 
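+ # Note on the atomic writes in this kernel: cuda.atomic.add only
+ # operates on real-valued arrays, so each complex mode-1 component
+ # is deposited with two separate atomic adds, one on the .real view
+ # and one on the .imag view of the array, while the mode-0
+ # components (real by azimuthal symmetry) only need a single
+ # atomic add on the .real view.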
+ cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m0_00) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell - 1 + srl), J_t_m0_01) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m0_02) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m0_03) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell ), J_t_m0_10) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_11) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_11.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_11.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell), J_t_m0_12) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell), J_t_m0_13) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m0_20) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 1 + sru), J_t_m0_21) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m0_22) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m0_23) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m0_30) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 2 + sru2), J_t_m0_31) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m0_32) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), 
J_t_m1_32.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m0_33) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.imag) + + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m0_00) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell - 1 + srl), J_z_m0_01) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m0_02) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m0_03) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell ), J_z_m0_10) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_11) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_11.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_11.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell), J_z_m0_12) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell), J_z_m0_13) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m0_20) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 1 + sru), J_z_m0_21) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m0_22) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m0_23) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m0_30) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.imag) + + 
cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 2 + sru2), J_z_m0_31) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m0_32) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m0_33) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.imag) diff --git a/fbpic/particles/deposition/numba_methods.py b/fbpic/particles/deposition/numba_methods.py new file mode 100644 index 000000000..54fedeb39 --- /dev/null +++ b/fbpic/particles/deposition/numba_methods.py @@ -0,0 +1,65 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the deposition methods for rho and J for linear and cubic +order shapes on the CPU with numba. +""" +import numba + +@numba.njit +def deposit_field_numba(Fptcl, Fgrid, + iz, ir, Sz, Sr, sign_guards): + """ + Perform the deposition using numba + + Parameters + ---------- + Fptcl : 1darray of complexs + (one element per macroparticle) + Contains the charge or current for each macroparticle (already + multiplied by exp(im theta)), from which to do the deposition + + Fgrid : 2darray of complexs + Contains the fields on the interpolation grid. + Is modified by this function + + iz, ir : 2darray of ints + Arrays of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + Contains the index of the cells that each macroparticle + will deposit to. + + Sz, Sr: 2darray of floats + Arrays of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + Contains the weight for respective cells from iz and ir, + for each macroparticle. + + sign_guards : float + The sign (+1 or -1) with which the weight of the guard cells should + be added to the 0th cell. + """ + + # Get the total number of particles + Ntot = len(Fptcl) + + # Loop over all particles + for ip in range(Ntot): + # Loop over adjacent cells (given by shape order) + # Use helper variables `ir_corr` and `Sr_corr`, in order to avoid + # modifying ir and Sr in place. (This is not strictly necessary, + # but is just here as a safeguard.)
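+ # Negative values in ir denote guard cells below the axis: the loop
+ # below mirrors them back onto the physical grid (cell abs(ir)-1)
+ # and multiplies their weight by sign_guards, according to the
+ # symmetry of the deposited field.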
+ for cell_index_r in range(ir.shape[0]): + for cell_index_z in range(iz.shape[0]): + # Correct the guard cell index and sign + if ir[cell_index_r, ip] < 0: + ir_corr = abs(ir[cell_index_r, ip]) - 1 + Sr_corr = sign_guards * Sr[cell_index_r, ip] + else: + ir_corr = ir[cell_index_r, ip] + Sr_corr = Sr[cell_index_r, ip] + # Deposit field from particle to the respective grid point + Fgrid[ iz[cell_index_z, ip], ir_corr ] += \ + Sz[cell_index_z,ip] * Sr_corr * Fptcl[ip] diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py new file mode 100644 index 000000000..4f38862a6 --- /dev/null +++ b/fbpic/particles/deposition/threading_methods.py @@ -0,0 +1,1530 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the deposition methods for rho and J for linear and cubic +order shapes on the CPU with threading. +""" +import numba +from numba import prange, int64 +import math +from scipy.constants import c + +# ------------------------------- +# Particle shape Factor functions +# ------------------------------- + +# Linear shapes +@numba.njit +def z_shape_linear(cell_position, index): + iz = int64(math.floor(cell_position)) + if index == 0: + return iz+1.-cell_position + if index == 1: + return cell_position - iz + +@numba.njit +def r_shape_linear(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(ir+1.-cell_position) + if index == 1: + return flip_factor*(cell_position - ir) + +# Cubic shapes +@numba.njit +def z_shape_cubic(cell_position, index): + iz = int64(math.floor(cell_position)) - 1 + if index == 0: + return (-1./6.)*((cell_position-iz)-2)**3 + if index == 1: + return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) + if index == 2: + return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) + if index == 3: + return (-1./6.)*(((iz+3)-cell_position)-2)**3 + +@numba.njit +def r_shape_cubic(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) - 1 + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 + if index == 1: + if ir+1 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) + if index == 2: + if ir+2 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) + if index == 3: + if ir+3 < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 + +# ------------------------------- +# Field deposition - linear - rho +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_rho_prange_linear(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0_global, rho_m1_global, + nthreads, ptcl_chunk_indices): + """ + Deposition of the charge density rho using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each thread are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to a global array.
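The final reduction mentioned in these docstrings happens outside the deposition kernels. A minimal sketch of what such a reduction could look like (the helper name and loop layout below are illustrative assumptions, not part of this patch):

    import numba
    from numba import prange

    @numba.njit( parallel=True )
    def sum_reduce_2d_array( global_array, reduced_array ):
        # global_array has shape (nthreads, Nz, Nr): sum the per-thread
        # copies (axis 0) into the actual 2d grid array
        for iz in prange( global_array.shape[1] ):
            for it in range( global_array.shape[0] ):
                for ir in range( global_array.shape[2] ):
                    reduced_array[iz, ir] += global_array[it, iz, ir]

Parallelizing over iz gives each (iz, ir) element a single owner thread, so the reduction itself needs no atomics.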
+ (This final reduction is *not* done in this function) + + Calculates the weighted amount of rho that is deposited to the + 4 cells surrounding the particle based on its shape (linear). + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local charge densities + on the interpolation grid for mode 0 and 1. + (they are modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. divisions of particle array between threads) + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for i_thread in prange( nthreads ): + # Loop over all particles in thread chunk + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] + # Weights + wj = w[i_ptcl] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + + # Original index of the upper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Boundary Region Shifts + ir_flip = int( math.floor(r_cell) ) + + # Declare local field arrays + R_m0_00 = 0. + R_m0_01 = 0. + R_m0_10 = 0. + R_m0_11 = 0. + + R_m1_00 = 0. + 0.j + R_m1_01 = 0. + 0.j + R_m1_10 = 0. + 0.j + R_m1_11 = 0.
+ 0.j + + R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal + + if ir_flip == -1: + R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + else: + R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if ir_cell+1 > (Nr-1): + shift_r = -1 + if iz_cell+1 > Nz-1: + shift_z -= Nz + + # Write ptcl fields to thread-local part of global deposition array + rho_m0_global[i_thread, iz_cell, ir_cell] += R_m0_00 + rho_m1_global[i_thread, iz_cell, ir_cell] += R_m1_00 + + rho_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += R_m0_01 + rho_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += R_m1_01 + + rho_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += R_m0_10 + rho_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += R_m1_10 + + rho_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11 + rho_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11 + + return + +# ------------------------------- +# Field deposition - linear - J +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_J_prange_linear(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0_global, j_r_m1_global, + j_t_m0_global, j_t_m1_global, + j_z_m0_global, j_z_m1_global, + nthreads, ptcl_chunk_indices): + """ + Deposition of the current density J using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each thread are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of J that is deposited to the + 4 cells surrounding the particle based on its shape (linear). + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local current component + in each direction (r, t, z) on the interpolation grid for mode 0 and 1.
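The ptcl_chunk_indices argument used by these functions splits the particle arrays into one contiguous chunk per thread. A minimal sketch of how such indices could be built (illustrative only; the helper name is an assumption):

    import numpy as np

    def get_chunk_indices( Ntot, nthreads ):
        # nthreads+1 monotonically increasing indices that cut the
        # particle arrays [0, Ntot) into nearly equal chunks; thread i
        # then loops from indices[i] to indices[i+1], matching the
        # range( ptcl_chunk_indices[i_thread], ... ) bounds above
        return np.linspace( 0, Ntot, nthreads + 1 ).astype( np.int64 )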
+ (they are modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. divisions of particle array between threads) + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for i_thread in prange( nthreads ): + # Loop over all particles in thread chunk + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] + # Velocity + uxj = ux[i_ptcl] + uyj = uy[i_ptcl] + uzj = uz[i_ptcl] + # Inverse gamma + inv_gammaj = inv_gamma[i_ptcl] + # Weights + wj = w[i_ptcl] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Original index of the upper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Boundary Region Shifts + ir_flip = int( math.floor(r_cell) ) + + # Declare local field arrays + J_r_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m0_00 = 0. + J_t_m1_00 = 0. + 0.j + J_z_m0_00 = 0. + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m0_01 = 0. + J_t_m1_01 = 0. + 0.j + J_z_m0_01 = 0. + J_z_m1_01 = 0. + 0.j + + J_r_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m0_10 = 0. + J_t_m1_10 = 0. + 0.j + J_z_m0_10 = 0. + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m0_11 = 0. + J_t_m1_11 = 0. + 0.j + J_z_m0_11 = 0. + J_z_m1_11 = 0.
+ 0.j + + J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Take into account lower r flips + if ir_flip == -1: + J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + else: + J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if (ir_cell+1) > (Nr-1): + shift_r = -1 + if (iz_cell+1) > Nz-1: + shift_z -= Nz + + # Write ptcl fields to thread-local part of global deposition array + j_r_m0_global[i_thread,iz_cell, ir_cell] += J_r_m0_00 + j_r_m1_global[i_thread,iz_cell, ir_cell] += J_r_m1_00 + + j_r_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01 + j_r_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01 + + j_r_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10 + 
j_r_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10 + + j_r_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11 + j_r_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11 + + j_t_m0_global[i_thread,iz_cell, ir_cell] += J_t_m0_00 + j_t_m1_global[i_thread,iz_cell, ir_cell] += J_t_m1_00 + + j_t_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01 + j_t_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01 + + j_t_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10 + j_t_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10 + + j_t_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11 + j_t_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11 + + j_z_m0_global[i_thread,iz_cell, ir_cell] += J_z_m0_00 + j_z_m1_global[i_thread,iz_cell, ir_cell] += J_z_m1_00 + + j_z_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01 + j_z_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01 + + j_z_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10 + j_z_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10 + + j_z_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11 + j_z_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11 + + return + + +# ------------------------------- +# Field deposition - cubic - rho +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_rho_prange_cubic(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0_global, rho_m1_global, + nthreads, ptcl_chunk_indices): + """ + Deposition of the charge density rho using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each thread are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of rho that is deposited to the + 16 cells surrounding the particle based on its shape (cubic). + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local charge densities + on the interpolation grid for mode 0 and 1. + (they are modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e.
divisions of particle array between threads) + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for i_thread in prange( nthreads ): + # Loop over all particles in thread chunk + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] + # Weights + wj = w[i_ptcl] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + + # Original index of the upper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Compute values in local copies and consider boundaries + ir_flip = int( math.floor(r_cell) ) - 1 + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode. + R_m0_00 = 0. + R_m1_00 = 0. + 0.j + + R_m0_01 = 0. + R_m1_01 = 0. + 0.j + + R_m0_02 = 0. + R_m1_02 = 0. + 0.j + + R_m0_03 = 0. + R_m1_03 = 0. + 0.j + + R_m0_10 = 0. + R_m1_10 = 0. + 0.j + + R_m0_11 = 0. + R_m1_11 = 0. + 0.j + + R_m0_12 = 0. + R_m1_12 = 0. + 0.j + + R_m0_13 = 0. + R_m1_13 = 0. + 0.j + + R_m0_20 = 0. + R_m1_20 = 0. + 0.j + + R_m0_21 = 0. + R_m1_21 = 0. + 0.j + + R_m0_22 = 0. + R_m1_22 = 0. + 0.j + + R_m0_23 = 0. + R_m1_23 = 0. + 0.j + + R_m0_30 = 0. + R_m1_30 = 0. + 0.j + + R_m0_31 = 0. + R_m1_31 = 0. + 0.j + + R_m0_32 = 0. + R_m1_32 = 0. + 0.j + + R_m0_33 = 0. + R_m1_33 = 0.
+ 0.j + + if (ir_flip == -2): + R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + if (ir_flip == -1): + R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 
3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + if (ir_flip >= 0): + R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + # 
Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Write ptcl fields to thread-local part of global deposition array + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00 + rho_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += R_m0_01 + rho_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += R_m1_01 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell] += R_m0_10 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell] += R_m1_10 + rho_m0_global[i_thread, iz_cell, ir_cell] += R_m0_11 + rho_m1_global[i_thread, iz_cell, ir_cell] += R_m1_11 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += R_m0_12 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += R_m1_12 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += R_m0_13 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += R_m1_13 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20 + rho_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += R_m0_21 + rho_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += R_m1_21 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30 + rho_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += R_m0_31 + rho_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += R_m1_31 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33 + + return + +# ------------------------------- +# Field deposition - cubic - J +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_J_prange_cubic(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0_global, j_r_m1_global, + j_t_m0_global, j_t_m1_global, + j_z_m0_global, j_z_m1_global, + nthreads, ptcl_chunk_indices): + """ + Deposition of the current density J using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each thread are + stored in copies of the global grid.
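Both the linear and cubic shape factors defined at the top of this file form a partition of unity: the weights for one particle sum to 1, which is what keeps the deposition charge-conserving. A quick sanity check (illustrative only), using the z_shape_cubic function defined above:

    # The four cubic-spline weights at an arbitrary position sum to 1
    pos = 3.7  # position in cell units
    total = ( z_shape_cubic( pos, 0 ) + z_shape_cubic( pos, 1 )
            + z_shape_cubic( pos, 2 ) + z_shape_cubic( pos, 3 ) )
    assert abs( total - 1. ) < 1e-12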
At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of J that is deposited to the + 16 cells surrounding the particle based on its shape (cubic). + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local current component + in each direction (r, t, z) on the interpolation grid for mode 0 and 1. + (they are modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. divisions of particle array between threads) + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for i_thread in prange( nthreads ): + # Loop over all particles in thread chunk + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] + # Velocity + uxj = ux[i_ptcl] + uyj = uy[i_ptcl] + uzj = uz[i_ptcl] + # Inverse gamma + inv_gammaj = inv_gamma[i_ptcl] + # Weights + wj = w[i_ptcl] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Original index of the upper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Compute values in local copies and consider boundaries + ir_flip = int64(math.floor(r_cell)) - 1 + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode for r,t and z.
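+ # (Using individual scalar variables rather than a small local
+ # array means numba can keep these accumulators in registers
+ # inside the particle loop; they are flushed to the thread-local
+ # global arrays only once per particle, after the branches below.)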
+ J_r_m0_00 = 0. + J_t_m0_00 = 0. + J_z_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m1_00 = 0. + 0.j + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_t_m0_01 = 0. + J_z_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m1_01 = 0. + 0.j + J_z_m1_01 = 0. + 0.j + + J_r_m0_02 = 0. + J_t_m0_02 = 0. + J_z_m0_02 = 0. + J_r_m1_02 = 0. + 0.j + J_t_m1_02 = 0. + 0.j + J_z_m1_02 = 0. + 0.j + + J_r_m0_03 = 0. + J_t_m0_03 = 0. + J_z_m0_03 = 0. + J_r_m1_03 = 0. + 0.j + J_t_m1_03 = 0. + 0.j + J_z_m1_03 = 0. + 0.j + + J_r_m0_10 = 0. + J_t_m0_10 = 0. + J_z_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m1_10 = 0. + 0.j + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_t_m0_11 = 0. + J_z_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m1_11 = 0. + 0.j + J_z_m1_11 = 0. + 0.j + + J_r_m0_12 = 0. + J_t_m0_12 = 0. + J_z_m0_12 = 0. + J_r_m1_12 = 0. + 0.j + J_t_m1_12 = 0. + 0.j + J_z_m1_12 = 0. + 0.j + + J_r_m0_13 = 0. + J_t_m0_13 = 0. + J_z_m0_13 = 0. + J_r_m1_13 = 0. + 0.j + J_t_m1_13 = 0. + 0.j + J_z_m1_13 = 0. + 0.j + + J_r_m0_20 = 0. + J_t_m0_20 = 0. + J_z_m0_20 = 0. + J_r_m1_20 = 0. + 0.j + J_t_m1_20 = 0. + 0.j + J_z_m1_20 = 0. + 0.j + + J_r_m0_21 = 0. + J_t_m0_21 = 0. + J_z_m0_21 = 0. + J_r_m1_21 = 0. + 0.j + J_t_m1_21 = 0. + 0.j + J_z_m1_21 = 0. + 0.j + + J_r_m0_22 = 0. + J_t_m0_22 = 0. + J_z_m0_22 = 0. + J_r_m1_22 = 0. + 0.j + J_t_m1_22 = 0. + 0.j + J_z_m1_22 = 0. + 0.j + + J_r_m0_23 = 0. + J_t_m0_23 = 0. + J_z_m0_23 = 0. + J_r_m1_23 = 0. + 0.j + J_t_m1_23 = 0. + 0.j + J_z_m1_23 = 0. + 0.j + + J_r_m0_30 = 0. + J_t_m0_30 = 0. + J_z_m0_30 = 0. + J_r_m1_30 = 0. + 0.j + J_t_m1_30 = 0. + 0.j + J_z_m1_30 = 0. + 0.j + + J_r_m0_31 = 0. + J_t_m0_31 = 0. + J_z_m0_31 = 0. + J_r_m1_31 = 0. + 0.j + J_t_m1_31 = 0. + 0.j + J_z_m1_31 = 0. + 0.j + + J_r_m0_32 = 0. + J_t_m0_32 = 0. + J_z_m0_32 = 0. + J_r_m1_32 = 0. + 0.j + J_t_m1_32 = 0. + 0.j + J_z_m1_32 = 0. + 0.j + + J_r_m0_33 = 0. + J_t_m0_33 = 0. + J_z_m0_33 = 0. + J_r_m1_33 = 0. + 0.j + J_t_m1_33 = 0. + 0.j + J_z_m1_33 = 0. 
+ 0.j + + if (ir_flip == -2): + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell,
+ J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir_flip == -1): + J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += 
r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += 
r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir_flip >= 0): + J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 
2)*J_t_m1_scal + J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 + j_r_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_r_m0_01 + j_r_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_r_m1_01 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 + j_r_m0_global[i_thread, iz_cell, ir_cell] += J_r_m0_11 + j_r_m1_global[i_thread, iz_cell, ir_cell] += J_r_m1_11 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_r_m0_12 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_r_m1_12 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 + j_r_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_r_m0_21 + j_r_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_r_m1_21 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 + j_r_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 + j_r_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 + 
j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 + + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 + j_t_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_t_m0_01 + j_t_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_t_m1_01 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 + j_t_m0_global[i_thread, iz_cell, ir_cell] += J_t_m0_11 + j_t_m1_global[i_thread, iz_cell, ir_cell] += J_t_m1_11 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_t_m0_12 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_t_m1_12 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 + j_t_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_t_m0_21 + j_t_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_t_m1_21 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30 + j_t_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 + j_t_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 + + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 + j_z_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_z_m0_01 + j_z_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_z_m1_01 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 + j_z_m0_global[i_thread, iz_cell, ir_cell] += J_z_m0_11 + j_z_m1_global[i_thread, iz_cell, ir_cell] += J_z_m1_11 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_z_m0_12 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_z_m1_12 + 
j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20 + j_z_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_z_m0_21 + j_z_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_z_m1_21 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30 + j_z_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31 + j_z_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33 + + return + +# ----------------------------------------------------------------------- +# Parallel reduction of the global arrays for threads into a single array +# ----------------------------------------------------------------------- + +@numba.njit( parallel=True ) +def sum_reduce_2d_array( global_array, reduced_array ): + """ + Sum the array `global_array` along its first axis and + add it into `reduced_array`. + + Parameters + ---------- + global_array: 3darray of complexs + Field array whose first dimension corresponds to the + reduction dimension (typically: the number of threads used + during the current deposition) + + reduced_array: 2darray of complexs + Field array into which the result of the reduction is added + """ + # Extract size of each dimension + Nreduce, Nz, Nr = global_array.shape + + # Parallel loop over iz + for iz in prange( Nz ): + # Loop over the reduction dimension (slow dimension) + for it in range( Nreduce ): + # Loop over ir (fast dimension) + for ir in range( Nr ): + reduced_array[ iz, ir ] += global_array[ it, iz, ir ] diff --git a/fbpic/particles/gathering/__init__.py b/fbpic/particles/gathering/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/fbpic/particles/cuda_methods.py b/fbpic/particles/gathering/cuda_methods.py similarity index 62% rename from fbpic/particles/cuda_methods.py rename to fbpic/particles/gathering/cuda_methods.py index dedc1f2e7..12c763860 100644 --- a/fbpic/particles/cuda_methods.py +++ b/fbpic/particles/gathering/cuda_methods.py @@ -3,202 +3,14 @@ # License: 3-Clause-BSD-LBNL """ This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the optimized particles methods that use cuda on a GPU +It defines the field gathering methods linear and cubic order shapes +on the GPU using CUDA. 
""" from numba import cuda, float64, int64 -from accelerate.cuda import sorting import math -from scipy.constants import c, e -import numpy as np # ----------------------- -# Particle pusher utility -# ----------------------- - -@cuda.jit(device=True, inline=True) -def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, - Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): - """ - Push at single macroparticle, using the Vay pusher - """ - # Get the magnetic rotation vector - taux = bconst*Bx - tauy = bconst*By - tauz = bconst*Bz - tau2 = taux**2 + tauy**2 + tauz**2 - - # Get the momenta at the half timestep - uxp = ux_i + econst*Ex \ - + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) - uyp = uy_i + econst*Ey \ - + inv_gamma_i*( uz_i*taux - ux_i*tauz ) - uzp = uz_i + econst*Ez \ - + inv_gamma_i*( ux_i*tauy - uy_i*taux ) - sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 - utau = uxp*taux + uyp*tauy + uzp*tauz - - # Get the new 1./gamma - inv_gamma_f = math.sqrt( - 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) - - # Reuse the tau and utau arrays to save memory - tx = inv_gamma_f*taux - ty = inv_gamma_f*tauy - tz = inv_gamma_f*tauz - ut = inv_gamma_f*utau - s = 1./( 1 + tau2*inv_gamma_f**2 ) - - # Get the new u - ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) - uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) - uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) - - return( ux_f, uy_f, uz_f, inv_gamma_f ) - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64, float64, int32, float64)') -def push_p_gpu( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, - q, m, Ntot, dt ) : - """ - Advance the particles' momenta, using cuda on the GPU - - Parameters - ---------- - ux, uy, uz : 1darray of floats - The velocity of the particles - (is modified by this function) - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - Ex, Ey, Ez : 1darray of floats - The electric fields acting on the particles - - Bx, By, Bz : 1darray of floats - The magnetic fields acting on the particles - - q : float - The charge of the particle species - - m : float - The mass of the particle species - - Ntot : int - The total number of particles - - dt : float - The time by which the momenta is advanced - """ - # Set a few constants - econst = q*dt/(m*c) - bconst = 0.5*q*dt/m - - #Cuda 1D grid - ip = cuda.grid(1) - - # Loop over the particles - if ip < Ntot: - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst) - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64, int32, float64, int16[:])') -def push_p_ioniz_gpu( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, - m, Ntot, dt, ionization_level ) : - """ - Advance the particles' momenta, using numba on the GPU - This take into account that the particles are ionizable, and thus - that their charge is determined by `ionization_level` - - Parameters - ---------- - ux, uy, uz : 1darray of floats - The velocity of the particles - (is modified by this function) - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - Ex, Ey, Ez : 1darray of floats - The electric fields acting on the particles - - Bx, By, Bz : 1darray of floats - The magnetic fields acting on the particles - - m : float - The mass of the particle species - - Ntot : int - The total 
number of particles - - dt : float - The time by which the momenta is advanced - - ionization_level : 1darray of ints - The number of electrons that each ion is missing - (compared to a neutral atom) - """ - #Cuda 1D grid - ip = cuda.grid(1) - - # Loop over the particles - if ip < Ntot: - if ionization_level[ip] != 0: - # Set a few constants - econst = ionization_level[ip] * e * dt/(m*c) - bconst = 0.5 * ionization_level[ip] * e * dt/m - # Use the Vay pusher - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst) - -@cuda.jit('void(float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64)') -def push_x_gpu( x, y, z, ux, uy, uz, inv_gamma, dt ) : - """ - Advance the particles' positions over one half-timestep - - This assumes that the positions (x, y, z) are initially either - one half-timestep *behind* the momenta (ux, uy, uz), or at the - same timestep as the momenta. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - (is modified by this function) - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - dt : float (seconds) - The time by which the position is advanced - """ - # Half timestep, multiplied by c - chdt = c*0.5*dt - - i = cuda.grid(1) - if i < x.shape[0]: - # Particle push - inv_g = inv_gamma[i] - x[i] += chdt*inv_g*ux[i] - y[i] += chdt*inv_g*uy[i] - z[i] += chdt*inv_g*uz[i] - -# ----------------------- -# Field gathering utility +# Field gathering linear # ----------------------- @cuda.jit('void(float64[:], float64[:], float64[:], \ @@ -521,6 +333,9 @@ def gather_field_gpu_linear(x, y, z, By[i] = sin*Fr + cos*Ft Bz[i] = Fz +# ----------------------- +# Field gathering cubic +# ----------------------- @cuda.jit('void(float64[:], float64[:], float64[:], \ float64, float64, int32, \ @@ -615,8 +430,8 @@ def gather_field_gpu_cubic(x, y, z, z_cell = invdz*(zj - zmin) - 0.5 # Calculate the shape factors - Sr = cuda.local.array((4,), dtype=float64) ir = cuda.local.array((4,), dtype=int64) + Sr = cuda.local.array((4,), dtype=float64) ir[0] = int64(math.floor(r_cell)) - 1 ir[1] = ir[0] + 1 ir[2] = ir[1] + 1 @@ -779,219 +594,4 @@ def gather_field_gpu_cubic(x, y, z, # and write to particle field arrays Bx[i] = cos*Fr - sin*Ft By[i] = sin*Fr + cos*Ft - Bz[i] = Fz - -# ----------------------------------------------------- -# Sorting utilities - get_cell_idx / sort / prefix_sum -# ----------------------------------------------------- - -@cuda.jit('void(int32[:], uint32[:], \ - float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32)') -def get_cell_idx_per_particle(cell_idx, sorted_idx, - x, y, z, - invdz, zmin, Nz, - invdr, rmin, Nr): - """ - Get the cell index of each particle. - The cell index is 1d and calculated by: - cell index in z + cell index in r * number of cells in z. - The cell_idx of a particle is defined by - the lower cell in r and z, that it deposits its field to. 
- - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - sorted_idx : 1darray of integers - The sorted index array needs to be reset - before doing the sort - - x, y, z : 1darray of floats (in meters) - The position of the particles - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, in each direction - - Nz, Nr : int - Number of gridpoints along the considered direction - """ - i = cuda.grid(1) - if i < cell_idx.shape[0]: - # Preliminary arrays for the cylindrical conversion - xj = x[i] - yj = y[i] - zj = z[i] - rj = math.sqrt( xj**2 + yj**2 ) - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Original index of the uppper and lower cell - ir_lower = int(math.floor( r_cell )) - iz_lower = int(math.floor( z_cell )) - - # Treat the boundary conditions - # guard cells in lower r - if ir_lower < 0: - ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr-1: - ir_lower = Nr-1 - # periodic boundaries in z - if iz_lower < 0: - iz_lower += Nz - if iz_lower > Nz-1: - iz_lower -= Nz - - # Reset sorted_idx array - sorted_idx[i] = i - # Calculate the 1D cell_idx by cell_idx_ir + cell_idx_iz * Nr - cell_idx[i] = ir_lower + iz_lower * Nr - -def sort_particles_per_cell(cell_idx, sorted_idx): - """ - Sort the cell index of the particles and - modify the sorted index array accordingly. - - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - sorted_idx : 1darray of integers - Represents the original index of the - particle before the sorting. - """ - Ntot = cell_idx.shape[0] - if Ntot > 0: - sorter = sorting.RadixSort(Ntot, dtype = np.int32) - sorter.sort(cell_idx, vals = sorted_idx) - -@cuda.jit('void(int32[:], int32[:])') -def incl_prefix_sum(cell_idx, prefix_sum): - """ - Perform an inclusive parallel prefix sum on the sorted - cell index array. The prefix sum array represents the - cumulative sum of the number of particles per cell - for each cell index. - - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # i is the index of the macroparticle - i = cuda.grid(1) - if i < cell_idx.shape[0]-1: - # ci: index of the cell of the present macroparticle - ci = cell_idx[i] - # ci_next: index of the cell of the next macroparticle - ci_next = cell_idx[i+1] - # Fill all the cells between ci and ci_next with the - # inclusive cumulative sum of the number particles until ci - while ci < ci_next: - # The cumulative sum of the number of particle per cell - # until ci is i+1 (since i obeys python index, starting at 0) - prefix_sum[ci] = i+1 - ci += 1 - # The last "macroparticle" of the cell_idx array fills up the - # rest of the prefix sum array - if i == cell_idx.shape[0]-1: - # Get the cell_index of the last macroparticle - ci = cell_idx[i] - # Fill all the remaining entries of the prefix sum array - for empty_index in range(ci, prefix_sum.shape[0]): - prefix_sum[empty_index] = i+1 - -@cuda.jit('void(int32[:])') -def reset_prefix_sum(prefix_sum): - """ - Resets the prefix sum. Sets all the values - to zero. 
- - Parameters - ---------- - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - i = cuda.grid(1) - if i < prefix_sum.shape[0]: - prefix_sum[i] = 0 - -@cuda.jit('void(uint32[:], float64[:], float64[:])') -def write_sorting_buffer(sorted_idx, val, buf): - """ - Writes the values of a particle array to a buffer, - while rearranging them to match the sorted cell index array. - - Parameters - ---------- - sorted_idx : 1darray of integers - Represents the original index of the - particle before the sorting - - val : 1d array of floats - A particle data array - - buf : 1d array of floats - A buffer array to temporarily store the - sorted particle data array - """ - i = cuda.grid(1) - if i < val.shape[0]: - buf[i] = val[sorted_idx[i]] - -# ----------------------------------------------------- -# Device array creation utility (will be removed later) -# ----------------------------------------------------- - -def cuda_deposition_arrays(Nz = None, Nr = None, fieldtype = None): - """ - Create empty arrays on the GPU for the charge and - current deposition in each of the 4 possible direction. - - ########################################### - # Needs to be moved to the fields package! - ########################################### - - Parameters - ---------- - Nz : int - Number of cells in z. - Nr : int - Number of cells in r. - - fieldtype : string - Either 'rho' or 'J'. - """ - # Create empty arrays to store the four different possible - # cell directions a particle can deposit to. - if fieldtype == 'rho': - # Rho - third dimension represents 2 modes - rho0 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho1 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho2 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho3 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - return rho0, rho1, rho2, rho3 - - if fieldtype == 'J': - # J - third dimension represents 2 modes - # times 3 dimensions (r, t, z) - J0 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J1 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J2 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J3 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - return J0, J1, J2, J3 + Bz[i] = Fz \ No newline at end of file diff --git a/fbpic/particles/gathering/numba_methods.py b/fbpic/particles/gathering/numba_methods.py new file mode 100644 index 000000000..f85a79ea0 --- /dev/null +++ b/fbpic/particles/gathering/numba_methods.py @@ -0,0 +1,82 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the field gathering methods linear and cubic order shapes +on the CPU with numba. +""" +import numba + +@numba.jit(nopython=True) +def gather_field_numba(exptheta, m, Fgrid, Fptcl, + iz, ir, Sz, Sr, sign_guards): + """ + Perform the weighted sum using numba + + Parameters + ---------- + exptheta : 1darray of complexs + (one element per macroparticle) + Contains exp(-im theta) for each macroparticle + + m : int + Index of the mode. 
+ Determines whether a factor 2 should be applied + + Fgrid : 2darray of complexs + Contains the fields on the interpolation grid, + from which to do the gathering + + Fptcl : 1darray of floats + (one element per macroparticle) + Contains the fields for each macroparticle + Is modified by this function + + iz, ir : 2darray of ints + Arrays of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + Contains the index of the cells that each macroparticle + will gather from. + + Sz, Sr : 2darray of floats + Arrays of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + Contains the weight for respective cells from iz and ir, + for each macroparticle. + + sign_guards : float + The sign (+1 or -1) with which the weight of the guard cells should + be added to the 0th cell. + """ + # Get the total number of particles + Ntot = len(Fptcl) + + # Loop over the particles + for ip in range(Ntot): + # Erase the temporary variable + F = 0.j + # Loop over all the adjacent cells (given by shape order) + # Use helper variables `ir_corr` and `Sr_corr`. + # This is necessary, because ir and Sr should **not** be modified + # **in-place**. (This is because ir and Sr are reused several + # times, as we call the present function 3 times, with different + # values for sign_guards.) + for cell_index_r in range(ir.shape[0]): + for cell_index_z in range(iz.shape[0]): + # Correct the guard cell index and sign + if ir[cell_index_r, ip] < 0: + ir_corr = abs(ir[cell_index_r, ip]) - 1 + Sr_corr = sign_guards * Sr[cell_index_r, ip] + else: + ir_corr = ir[cell_index_r, ip] + Sr_corr = Sr[cell_index_r, ip] + # Gather the field value at the respective grid point + F += Sz[cell_index_z, ip] * Sr_corr * \ + Fgrid[ iz[cell_index_z, ip], ir_corr] + + # Add the complex phase + if m == 0: + Fptcl[ip] += (F * exptheta[ip]).real + if m > 0: + Fptcl[ip] += 2 * (F * exptheta[ip]).real \ No newline at end of file diff --git a/fbpic/particles/gathering/threading_methods.py b/fbpic/particles/gathering/threading_methods.py new file mode 100644 index 000000000..9809fcce5 --- /dev/null +++ b/fbpic/particles/gathering/threading_methods.py @@ -0,0 +1,601 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the field gathering methods linear and cubic order shapes +on the CPU with threading. +""" +from numba import prange, int64 +import numba +import math +import numpy as np + +# ----------------------- +# Field gathering linear +# ----------------------- + +@numba.njit(parallel=True) +def gather_field_prange_linear(x, y, z, + invdz, zmin, Nz, + invdr, rmin, Nr, + Er_m0, Et_m0, Ez_m0, + Er_m1, Et_m1, Ez_m1, + Br_m0, Bt_m0, Bz_m0, + Br_m1, Bt_m1, Bz_m1, + Ex, Ey, Ez, + Bx, By, Bz): + """ + Gathering of the fields (E and B) using numba with multi-threading. + Iterates over the particles, calculates the weighted amount + of fields acting on each particle based on its shape (linear). + Fields are gathered in cylindrical coordinates and then + transformed to cartesian coordinates. + Supports only mode 0 and 1. 
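+ The field on a macroparticle at angle theta is reconstructed + from the azimuthal modes as Re[F_m0]+2*Re[F_m1*exp(-i*theta)], + hence the factor 2 applied to the mode-1 contribution below. 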
+ + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + (is modified by this function) + """ + # Gather the fields on the particles in parallel + for i in prange(x.shape[0]): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt( xj**2 + yj**2 ) + if (rj !=0. ) : + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else : + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos - 1.j*sin + + # Get linear weights for the gathering + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + # Original index of the upper and lower cell + ir_lower = int(math.floor( r_cell )) + ir_upper = ir_lower + 1 + iz_lower = int(math.floor( z_cell )) + iz_upper = iz_lower + 1 + # Linear weight + Sr_lower = ir_upper - r_cell + Sr_upper = r_cell - ir_lower + Sz_lower = iz_upper - z_cell + Sz_upper = z_cell - iz_lower + # Set guard weights to zero + Sr_guard = 0. + + # Treat the boundary conditions + # -------------------------------------------- + # guard cells in lower r + if ir_lower < 0: + Sr_guard = Sr_lower + Sr_lower = 0. + ir_lower = 0 + # absorbing in upper r + if ir_lower > Nr-1: + ir_lower = Nr-1 + if ir_upper > Nr-1: + ir_upper = Nr-1 + # periodic boundaries in z + # lower z boundaries + if iz_lower < 0: + iz_lower += Nz + if iz_upper < 0: + iz_upper += Nz + # upper z boundaries + if iz_lower > Nz-1: + iz_lower -= Nz + if iz_upper > Nz-1: + iz_upper -= Nz + + # Precalculate the shape factors + S_ll = Sz_lower*Sr_lower + S_lu = Sz_lower*Sr_upper + S_ul = Sz_upper*Sr_lower + S_uu = Sz_upper*Sr_upper + S_lg = Sz_lower*Sr_guard + S_ug = Sz_upper*Sr_guard + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
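+ + # Near the axis (ir_lower == ir_upper == 0), the contribution of the + # guard cell below the axis is gathered from the axis cell itself, + # with a sign set by the parity of each field component in each mode. 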
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Ez_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0] + Fz_m += 1. * S_ug * Ez_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Er_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Et_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Er_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Et_m1[ iz_upper, 0] + Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = cos*Fr - sin*Ft + Ey[i] = sin*Fr + cos*Ft + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
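+ + # The magnetic field is gathered with the same weights and the same + # per-mode accumulation as the electric field above. 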
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Bz_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0] + Fz_m += 1. * S_ug * Bz_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ] + + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Br_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Bt_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Br_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Bt_m1[ iz_upper, 0] + Fz_m += -1. * S_ug * Bz_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Bx[i] = cos*Fr - sin*Ft + By[i] = sin*Fr + cos*Ft + Bz[i] = Fz + + return Ex, Ey, Ez, Bx, By, Bz + +# ----------------------- +# Field gathering cubic +# ----------------------- + +@numba.njit(parallel=True) +def gather_field_prange_cubic(x, y, z, + invdz, zmin, Nz, + invdr, rmin, Nr, + Er_m0, Et_m0, Ez_m0, + Er_m1, Et_m1, Ez_m1, + Br_m0, Bt_m0, Bz_m0, + Br_m1, Bt_m1, Bz_m1, + Ex, Ey, Ez, + Bx, By, Bz): + """ + Gathering of the fields (E and B) using numba with multi-threading. + Iterates over the particles, calculates the weighted amount + of fields acting on each particle based on its shape (cubic). + Fields are gathered in cylindrical coordinates and then + transformed to cartesian coordinates. 
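+ The particle shape is a cubic B-spline, so each macroparticle + gathers from the 4x4 cells surrounding its position. 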
+ Supports only mode 0 and 1. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + (is modified by this function) + """ + # Deposit the field per cell in parallel + # (for threads < number of particles) + for i in prange(x.shape[0]): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos - 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particle, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the shape factors + ir = np.zeros(4, dtype=int64) #[0,0,0,0] + Sr = np.zeros(4) #[0.,0.,0.,0.] + ir[0] = int64(math.floor(r_cell)) - 1 + ir[1] = ir[0] + 1 + ir[2] = ir[1] + 1 + ir[3] = ir[2] + 1 + Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3 + Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4) + Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4) + Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3 + iz = np.zeros(4, dtype=int64) #[0,0,0,0] + Sz = np.zeros(4) #[0.,0.,0.,0.] + iz[0] = int64(math.floor(z_cell)) - 1 + iz[1] = iz[0] + 1 + iz[2] = iz[1] + 1 + iz[3] = iz[2] + 1 + Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3 + Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4) + Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4) + Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3 + # Lower and upper periodic boundary for z + index_z = 0 + while index_z < 4: + if iz[index_z] < 0: + iz[index_z] += Nz + if iz[index_z] > Nz - 1: + iz[index_z] -= Nz + index_z += 1 + # Lower and upper boundary for r + index_r = 0 + while index_r < 4: + if ir[index_r] < 0: + ir[index_r] = abs(ir[index_r])-1 + Sr[index_r] = (-1.)*Sr[index_r] + if ir[index_r] > Nr - 1: + ir[index_r] = Nr - 1 + index_r += 1 + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
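+ + # In the loops below, a negative product Sz[index_z]*Sr[index_r] marks + # a radial cell that was flipped across the axis: for mode 0, Er and Et + # keep the flipped (negative) weight while Ez is restored to a positive + # weight; for mode 1, the parities are reversed. 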
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]] + if Sz[index_z]*Sr[index_r] < 0: + Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Ez_m0[iz[index_z], ir[index_r]] + else: + Fz_m += Sz[index_z]*Sr[index_r]* \ + Ez_m0[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + if Sz[index_z]*Sr[index_r] < 0: + Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Er_m1[iz[index_z], ir[index_r]] + Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Et_m1[iz[index_z], ir[index_r]] + else: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Er_m1[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Et_m1[iz[index_z], ir[index_r]] + Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = (cos*Fr - sin*Ft) + Ey[i] = (sin*Fr + cos*Ft) + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Br_m0[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Bt_m0[iz[index_z], ir[index_r]] + if Sz[index_z]*Sr[index_r] < 0: + Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Bz_m0[iz[index_z], ir[index_r]] + else: + Fz_m += Sz[index_z]*Sr[index_r]* \ + Bz_m0[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + + # Add the fields for mode 1 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + if Sz[index_z]*Sr[index_r] < 0: + Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Br_m1[iz[index_z], ir[index_r]] + Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Bt_m1[iz[index_z], ir[index_r]] + else: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Br_m1[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Bt_m1[iz[index_z], ir[index_r]] + Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Bx[i] = cos*Fr - sin*Ft + By[i] = sin*Fr + cos*Ft + Bz[i] = Fz + + return Ex, Ey, Ez, Bx, By, Bz diff --git a/fbpic/particles/numba_methods.py b/fbpic/particles/numba_methods.py deleted file mode 100644 index 17d94a84a..000000000 --- a/fbpic/particles/numba_methods.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the optimized particles methods that use numba on a CPU -""" -import numba -import math -from scipy.constants import c, e - -# ----------------------- -# Particle pusher utility -# ----------------------- - -@numba.jit(nopython=True) -def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): - """ - Advance the particles' positions over one half-timestep - - This assumes that the positions (x, y, z) are initially either - one half-timestep *behind* the momenta (ux, uy, uz), or at the - same timestep as the momenta. 
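# ---------------------------------------------------------------------
# A minimal standalone sketch (not part of the patch) of the cubic
# B-spline shape factors used by gather_field_prange_cubic above.
# The value r_cell = 3.7 is an arbitrary example. For any position the
# four weights sum to one, so a uniform field is gathered exactly.
import math
import numpy as np

def cubic_shape_factors( r_cell ):
    # Indices of the four nearest cells, as in the kernel above
    i0 = int( math.floor(r_cell) ) - 1
    idx = np.array([ i0, i0+1, i0+2, i0+3 ])
    S = np.empty(4)
    S[0] = -1./6. * ((r_cell-idx[0])-2)**3
    S[1] = 1./6. * (3*(r_cell-idx[1])**3 - 6*(r_cell-idx[1])**2 + 4)
    S[2] = 1./6. * (3*(idx[2]-r_cell)**3 - 6*(idx[2]-r_cell)**2 + 4)
    S[3] = -1./6. * ((idx[3]-r_cell)-2)**3
    return idx, S

idx, S = cubic_shape_factors( 3.7 )
assert abs( S.sum() - 1. ) < 1e-12   # partition of unity
# ---------------------------------------------------------------------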
- """ - # Half timestep, multiplied by c - chdt = c*0.5*dt - - # Particle push - for ip in range(Ntot) : - x[ip] += chdt * inv_gamma[ip] * ux[ip] - y[ip] += chdt * inv_gamma[ip] * uy[ip] - z[ip] += chdt * inv_gamma[ip] * uz[ip] - -@numba.jit(nopython=True) -def push_p_numba( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) : - """ - Advance the particles' momenta, using numba - """ - # Set a few constants - econst = q*dt/(m*c) - bconst = 0.5*q*dt/m - - # Loop over the particles - for ip in range(Ntot) : - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) - -@numba.jit(nopython=True) -def push_p_ioniz_numba( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) : - """ - Advance the particles' momenta, using numba - """ - # Set a few constants - prefactor_econst = e*dt/(m*c) - prefactor_bconst = 0.5*e*dt/m - - # Loop over the particles - for ip in range(Ntot) : - - # For neutral macroparticles, skip this step - if ionization_level[ip] == 0: - continue - - # Calculate the charge dependent constants - econst = prefactor_econst * ionization_level[ip] - bconst = prefactor_bconst * ionization_level[ip] - # Perform the push - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], - econst, bconst ) - -@numba.jit(nopython=True) -def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, - Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): - """ - Push at single macroparticle, using the Vay pusher - """ - # Get the magnetic rotation vector - taux = bconst*Bx - tauy = bconst*By - tauz = bconst*Bz - tau2 = taux**2 + tauy**2 + tauz**2 - - # Get the momenta at the half timestep - uxp = ux_i + econst*Ex \ - + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) - uyp = uy_i + econst*Ey \ - + inv_gamma_i*( uz_i*taux - ux_i*tauz ) - uzp = uz_i + econst*Ez \ - + inv_gamma_i*( ux_i*tauy - uy_i*taux ) - sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 - utau = uxp*taux + uyp*tauy + uzp*tauz - - # Get the new 1./gamma - inv_gamma_f = math.sqrt( - 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) - - # Reuse the tau and utau variables to save memory - tx = inv_gamma_f*taux - ty = inv_gamma_f*tauy - tz = inv_gamma_f*tauz - ut = inv_gamma_f*utau - s = 1./( 1 + tau2*inv_gamma_f**2 ) - - # Get the new u - ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) - uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) - uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) - - return( ux_f, uy_f, uz_f, inv_gamma_f ) - -# ----------------------- -# Field gathering utility -# ----------------------- - -@numba.jit(nopython=True) -def gather_field_numba(exptheta, m, Fgrid, Fptcl, - iz, ir, Sz, Sr, sign_guards): - """ - Perform the weighted sum using numba - - Parameters - ---------- - exptheta : 1darray of complexs - (one element per macroparticle) - Contains exp(-im theta) for each macroparticle - - m : int - Index of the mode. - Determines wether a factor 2 should be applied - - Fgrid : 2darray of complexs - Contains the fields on the interpolation grid, - from which to do the gathering - - Fptcl : 1darray of floats - (one element per macroparticle) - Contains the fields for each macroparticle - Is modified by this function - - iz, ir : 2darray of ints - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the index of the cells that each macroparticle - will gather from. 
- - Sz, Sr: 2darray of floats - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the weight for respective cells from iz and ir, - for each macroparticle. - - sign_guards : float - The sign (+1 or -1) with which the weight of the guard cells should - be added to the 0th cell. - """ - # Get the total number of particles - Ntot = len(Fptcl) - - # Loop over the particles - for ip in range(Ntot): - # Erase the temporary variable - F = 0.j - # Loop over all the adjacent cells (given by shape order) - # Use helper variables `ir_corr` and `Sr_corr`. - # This is necessary, because ir and Sr should **not** be modified - # **in-place**. (This is because ir and Sr are reused several - # times, as we call the present function 3 times, with different - # values for sign_guards.) - for cell_index_r in range(ir.shape[0]): - for cell_index_z in range(iz.shape[0]): - # Correct the guard cell index and sign - if ir[cell_index_r, ip] < 0: - ir_corr = abs(ir[cell_index_r, ip]) - 1 - Sr_corr = sign_guards * Sr[cell_index_r, ip] - else: - ir_corr = ir[cell_index_r, ip] - Sr_corr = Sr[cell_index_r, ip] - # Gather the field value at the respective grid point - F += Sz[cell_index_z, ip] * Sr_corr * \ - Fgrid[ iz[cell_index_z, ip], ir_corr] - - # Add the complex phase - if m == 0: - Fptcl[ip] += (F * exptheta[ip]).real - if m > 0: - Fptcl[ip] += 2 * (F * exptheta[ip]).real - -# ------------------------- -# Charge deposition utility -# ------------------------- - -@numba.jit(nopython=True) -def deposit_field_numba(Fptcl, Fgrid, - iz, ir, Sz, Sr, sign_guards): - """ - Perform the deposition using numba - - Parameters - ---------- - Fptcl : 1darray of complexs - (one element per macroparticle) - Contains the charge or current for each macroparticle (already - multiplied by exp(im theta), from which to do the deposition - - Fgrid : 2darray of complexs - Contains the fields on the interpolation grid. - Is modified by this function - - iz, ir : 2darray of ints - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the index of the cells that each macroparticle - will deposit to. - - Sz, Sr: 2darray of floats - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the weight for respective cells from iz and ir, - for each macroparticle. - - sign_guards : float - The sign (+1 or -1) with which the weight of the guard cells should - be added to the 0th cell. - """ - - # Get the total number of particles - Ntot = len(Fptcl) - - # Loop over all particles - for ip in range(Ntot): - # Loop over adjacent cells (given by shape order) - # Use helper variables `ir_corr` and `Sr_corr`, in order to avoid - # modifying ir and Sr in place. (This is not strictly necessary, - # but is just here as a safeguard.) 
- for cell_index_r in range(ir.shape[0]): - for cell_index_z in range(iz.shape[0]): - # Correct the guard cell index and sign - if ir[cell_index_r, ip] < 0: - ir_corr = abs(ir[cell_index_r, ip]) - 1 - Sr_corr = sign_guards * Sr[cell_index_r, ip] - else: - ir_corr = ir[cell_index_r, ip] - Sr_corr = Sr[cell_index_r, ip] - # Deposit field from particle to the respective grid point - Fgrid[ iz[cell_index_z, ip], ir_corr ] += \ - Sz[cell_index_z,ip] * Sr_corr * Fptcl[ip] diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index f9a15a945..63e4a871c 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -9,28 +9,37 @@ from scipy.constants import c, e from .ionization import Ionizer from .tracking import ParticleTracker +import numba # Load the utility methods -from .utility_methods import weights, unalign_angles -# Load the numba routines -from .numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba, \ - gather_field_numba, deposit_field_numba +from .utilities.utility_methods import weights, unalign_angles +# Load the numba methods +from .push.numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba +from .deposition.numba_methods import deposit_field_numba +from .gathering.numba_methods import gather_field_numba + +# Check if threading is available, then import threaded functions +from fbpic.threading_utils import threading_enabled +if threading_enabled: + from .deposition.threading_methods import deposit_rho_prange_linear, \ + deposit_J_prange_linear, deposit_rho_prange_cubic, \ + deposit_J_prange_cubic, sum_reduce_2d_array + from .gathering.threading_methods import gather_field_prange_linear, \ + gather_field_prange_cubic # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: + # Load the CUDA methods from fbpic.cuda_utils import cuda, cuda_tpb_bpg_1d, cuda_tpb_bpg_2d - from .cuda_methods import push_p_gpu, push_p_ioniz_gpu, push_x_gpu, \ - gather_field_gpu_linear, gather_field_gpu_cubic, \ - write_sorting_buffer, cuda_deposition_arrays, \ + from .push.cuda_methods import push_p_gpu, push_p_ioniz_gpu, push_x_gpu + from .deposition.cuda_methods import deposit_rho_gpu_linear, \ + deposit_J_gpu_linear, deposit_rho_gpu_cubic, deposit_J_gpu_cubic + from .gathering.cuda_methods import gather_field_gpu_linear, \ + gather_field_gpu_cubic + from .utilities.cuda_sorting import write_sorting_buffer, \ get_cell_idx_per_particle, sort_particles_per_cell, \ reset_prefix_sum, incl_prefix_sum - from .cuda_deposition.cubic import deposit_rho_gpu_cubic, \ - deposit_J_gpu_cubic - from .cuda_deposition.linear import deposit_rho_gpu_linear, \ - deposit_J_gpu_linear - from .cuda_deposition.linear_non_atomic import deposit_rho_gpu, \ - deposit_J_gpu, add_rho, add_J class Particles(object) : """ @@ -50,7 +59,8 @@ def __init__(self, q, m, n, Npz, zmin, zmax, ux_m=0., uy_m=0., uz_m=0., ux_th=0., uy_th=0., uz_th=0., dens_func=None, continuous_injection=True, - use_cuda=False, grid_shape=None, particle_shape='linear' ) : + grid_shape=None, particle_shape='linear', + use_cuda=False ) : """ Initialize a uniform set of particles @@ -100,9 +110,6 @@ def dens_func( z, r ) ... Whether to continuously inject the particles, in the case of a moving window - use_cuda : bool, optional - Wether to use the GPU or not. - grid_shape: tuple, optional Needed when running on the GPU The shape of the local grid (including guard cells), i.e. @@ -111,15 +118,16 @@ def dens_func( z, r ) ... 
        particle_shape: str, optional
            Set the particle shape for the charge/current deposition.
-            Possible values are 'cubic', 'linear' and 'linear_non_atomic'.
-            While 'cubic' corresponds to third order shapes and 'linear'
-            to first order shapes, 'linear_non_atomic' uses an equivalent
-            deposition scheme to 'linear' which avoids atomics on the GPU.
+            Possible values are 'linear' and 'cubic' for first and third
+            order particle shape factors.
+
+        use_cuda : bool, optional
+            Whether to use the GPU or not.
        """
        # Register the timestep
        self.dt = dt

-        # Define wether or not to use the GPU
+        # Define whether or not to use the GPU
        self.use_cuda = use_cuda
        if (self.use_cuda==True) and (cuda_installed==False) :
            print('*** Cuda not available for the particles.')
@@ -198,6 +206,9 @@ def dens_func( z, r ) ...
        if dens_func is not None :
            self.w[:] = self.w * dens_func( self.z, r )

+        # Register particle shape
+        self.particle_shape = particle_shape
+
        # Allocate arrays and register variables when using CUDA
        if self.use_cuda:
            if grid_shape is None:
@@ -213,9 +224,11 @@ def dens_func( z, r ) ...
                dtype=np.int32 )
            # Register boolean that records if the particles are sorted or not
            self.sorted = False
-
-        # Register particle shape
-        self.particle_shape = particle_shape
+        # Register variables when using multithreading
+        self.use_threading = threading_enabled
+        if self.use_threading == True:
+            # Register number of threads
+            self.nthreads = numba.config.NUMBA_NUM_THREADS

    def send_particles_to_gpu( self ):
        """
@@ -423,6 +436,7 @@ def push_p( self ) :
        half-timestep *behind* the positions (x, y, z), and it brings them
        one half-timestep *ahead* of the positions.
        """
+        # GPU (CUDA) version
        if self.use_cuda:
            # Get the threads per block and the blocks per grid
            dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
@@ -441,7 +455,8 @@ def push_p( self ) :
                self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
                self.m, self.Ntot, self.dt, self.ionizer.ionization_level )
-        else :
+        # CPU version
+        else:
            if self.ionizer is None:
                push_p_numba(self.ux, self.uy, self.uz, self.inv_gamma,
                    self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
@@ -461,6 +476,7 @@ def halfpush_x( self ) :
        one half-timestep *behind* the momenta (ux, uy, uz), or at the
        same timestep as the momenta.
        """
+        # GPU (CUDA) version
        if self.use_cuda:
            # Get the threads per block and the blocks per grid
            dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
@@ -471,7 +487,8 @@ def halfpush_x( self ) :
                self.inv_gamma, self.dt )
            # The particle array is unsorted after the push in x
            self.sorted = False
-        else :
+        # CPU version
+        else:
            push_x_numba( self.x, self.y, self.z,
                self.ux, self.uy, self.uz,
                self.inv_gamma, self.Ntot, self.dt )
@@ -489,12 +506,24 @@ def gather( self, grid ) :
            (one InterpolationGrid object per azimuthal mode)
            Contains the field values on the interpolation grid
        """
-        if self.use_cuda == True:
+        # GPU (CUDA) version
+        if self.use_cuda:
            # Get the threads per block and the blocks per grid
            dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
            # Call the CUDA Kernel for the gathering of E and B Fields
            # for Mode 0 and 1 only.
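# ---------------------------------------------------------------------
# A hedged sketch (not part of the patch) of the 1D launch convention
# used throughout this file: cuda_tpb_bpg_1d (imported above from
# fbpic.cuda_utils) returns the launch pair that the kernels are indexed
# with, and each per-particle kernel guards itself with an explicit
# bounds check. The kernel name `scale_gpu` is a hypothetical example.
from numba import cuda
from fbpic.cuda_utils import cuda_tpb_bpg_1d

@cuda.jit
def scale_gpu( x, a ):
    i = cuda.grid(1)
    if i < x.shape[0]:   # bounds guard, as in push_x_gpu further below
        x[i] = a * x[i]

# dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( Ntot )
# scale_gpu[dim_grid_1d, dim_block_1d]( d_x, 2. )
# ---------------------------------------------------------------------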
- if self.particle_shape == 'cubic': + if self.particle_shape == 'linear': + gather_field_gpu_linear[dim_grid_1d, dim_block_1d]( + self.x, self.y, self.z, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + grid[0].Er, grid[0].Et, grid[0].Ez, + grid[1].Er, grid[1].Et, grid[1].Ez, + grid[0].Br, grid[0].Bt, grid[0].Bz, + grid[1].Br, grid[1].Bt, grid[1].Bz, + self.Ex, self.Ey, self.Ez, + self.Bx, self.By, self.Bz) + elif self.particle_shape == 'cubic': gather_field_gpu_cubic[dim_grid_1d, dim_block_1d]( self.x, self.y, self.z, grid[0].invdz, grid[0].zmin, grid[0].Nz, @@ -506,7 +535,24 @@ def gather( self, grid ) : self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz) else: - gather_field_gpu_linear[dim_grid_1d, dim_block_1d]( + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # CPU multi-threading version + elif self.use_threading: + if self.particle_shape == 'linear': + gather_field_prange_linear( + self.x, self.y, self.z, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + grid[0].Er, grid[0].Et, grid[0].Ez, + grid[1].Er, grid[1].Et, grid[1].Ez, + grid[0].Br, grid[0].Bt, grid[0].Bz, + grid[1].Br, grid[1].Bt, grid[1].Bz, + self.Ex, self.Ey, self.Ez, + self.Bx, self.By, self.Bz) + elif self.particle_shape == 'cubic': + gather_field_prange_cubic( self.x, self.y, self.z, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, @@ -516,6 +562,11 @@ def gather( self, grid ) : grid[1].Br, grid[1].Bt, grid[1].Bz, self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz) + else: + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # CPU single-core version else: # Preliminary arrays for the cylindrical conversion r = np.sqrt( self.x**2 + self.y**2 ) @@ -624,19 +675,13 @@ def deposit( self, fld, fieldtype ) : """ # Shortcut for the list of InterpolationGrid objects grid = fld.interp - - if self.use_cuda == True: + # GPU (CUDA) version + if self.use_cuda: # Get the threads per block and the blocks per grid dim_grid_2d_flat, dim_block_2d_flat = cuda_tpb_bpg_1d( grid[0].Nz*grid[0].Nr ) dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d( grid[0].Nz, grid[0].Nr ) - - # Create the helper arrays for deposition - if self.particle_shape == 'linear_non_atomic': - d_F0, d_F1, d_F2, d_F3 = cuda_deposition_arrays( - grid[0].Nz, grid[0].Nr, fieldtype=fieldtype) - # Sort the particles if self.sorted is False: self.sort_particles(fld=fld) @@ -648,17 +693,13 @@ def deposit( self, fld, fieldtype ) : # Rho if fieldtype == 'rho': # Deposit rho in each of four directions - if self.particle_shape == 'linear_non_atomic': - deposit_rho_gpu[dim_grid_2d_flat, dim_block_2d_flat]( + if self.particle_shape == 'linear': + deposit_rho_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat]( self.x, self.y, self.z, self.w, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, - d_F0, d_F1, d_F2, d_F3, - self.cell_idx, self.prefix_sum) - # Add the four directions together - add_rho[dim_grid_2d, dim_block_2d]( grid[0].rho, grid[1].rho, - d_F0, d_F1, d_F2, d_F3) + self.cell_idx, self.prefix_sum) elif self.particle_shape == 'cubic': deposit_rho_gpu_cubic[dim_grid_2d_flat, dim_block_2d_flat]( self.x, self.y, self.z, self.w, @@ -666,33 +707,23 @@ def deposit( self, fld, fieldtype ) : grid[0].invdr, grid[0].rmin, grid[0].Nr, grid[0].rho, grid[1].rho, self.cell_idx, self.prefix_sum) - elif 
self.particle_shape == 'linear':
-                    deposit_rho_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
-                        self.x, self.y, self.z, self.w,
-                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
-                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        grid[0].rho, grid[1].rho,
-                        self.cell_idx, self.prefix_sum)
                else:
-                    raise ValueError("`particle_shape` should be either 'linear', 'linear_atomic' \
-                        or 'cubic' but is `%s`" % self.particle_shape)
+                    raise ValueError("`particle_shape` should be either \
+                        'linear' or 'cubic' \
+                        but is `%s`" % self.particle_shape)
            # J
            elif fieldtype == 'J':
                # Deposit J in each of four directions
-                if self.particle_shape == 'linear_non_atomic':
-                    deposit_J_gpu[dim_grid_2d_flat, dim_block_2d_flat](
+                if self.particle_shape == 'linear':
+                    deposit_J_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
                        self.x, self.y, self.z, self.w,
                        self.ux, self.uy, self.uz, self.inv_gamma,
                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        d_F0, d_F1, d_F2, d_F3,
-                        self.cell_idx, self.prefix_sum)
-                    # Add the four directions together
-                    add_J[dim_grid_2d, dim_block_2d](
                        grid[0].Jr, grid[1].Jr,
                        grid[0].Jt, grid[1].Jt,
                        grid[0].Jz, grid[1].Jz,
-                        d_F0, d_F1, d_F2, d_F3)
+                        self.cell_idx, self.prefix_sum)
                elif self.particle_shape == 'cubic':
                    deposit_J_gpu_cubic[dim_grid_2d_flat, dim_block_2d_flat](
                        self.x, self.y, self.z, self.w,
@@ -703,26 +734,114 @@ def deposit( self, fld, fieldtype ) :
                        grid[0].Jt, grid[1].Jt,
                        grid[0].Jz, grid[1].Jz,
                        self.cell_idx, self.prefix_sum)
-                elif self.particle_shape == 'linear':
-                    deposit_J_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
+                else:
+                    raise ValueError("`particle_shape` should be either \
+                        'linear' or 'cubic' \
+                        but is `%s`" % self.particle_shape)
+            else:
+                raise ValueError("`fieldtype` should be either 'J' or \
+                    'rho', but is `%s`" % fieldtype)
+        # CPU multi-threading version
+        elif self.use_threading:
+
+            # Divide particles into chunks (each chunk is handled by a
+            # different thread) and register the indices that bound each chunk
+            n_avg_per_thread = int( self.Ntot/self.nthreads )
+            # Assign n_avg_per_thread to each thread (except the last one)
+            ptcl_chunk_indices = np.array(
+                [ i_chk*n_avg_per_thread for i_chk in range(self.nthreads+1) ],
+                dtype=np.uint64 )
+            ptcl_chunk_indices[-1] = self.Ntot
+            # Multithreading functions for the deposition of rho or J
+            # for Mode 0 and 1 only.
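# ---------------------------------------------------------------------
# Worked example (not part of the patch) of the chunk splitting above,
# with hypothetical values Ntot = 10 and nthreads = 4: integer division
# gives 2 particles per thread and the last chunk absorbs the remainder.
import numpy as np
Ntot, nthreads = 10, 4
n_avg_per_thread = int( Ntot/nthreads )
ptcl_chunk_indices = np.array(
    [ i_chk*n_avg_per_thread for i_chk in range(nthreads+1) ],
    dtype=np.uint64 )
ptcl_chunk_indices[-1] = Ntot
# -> [0, 2, 4, 6, 10]; thread t handles the particles in the slice
#    ptcl_chunk_indices[t]:ptcl_chunk_indices[t+1]
# ---------------------------------------------------------------------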
+ if fieldtype == 'rho': + # Generate temporary arrays for rho + rho_m0_global = np.zeros( + (self.nthreads, grid[0].rho.shape[0], grid[0].rho.shape[1]), + dtype=grid[0].rho.dtype ) + rho_m1_global = np.zeros( + (self.nthreads, grid[1].rho.shape[0], grid[1].rho.shape[1]), + dtype=grid[1].rho.dtype ) + # Deposit rho using CPU threading + if self.particle_shape == 'linear': + deposit_rho_prange_linear( + self.x, self.y, self.z, self.w, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + rho_m0_global, rho_m1_global, + self.nthreads, ptcl_chunk_indices ) + elif self.particle_shape == 'cubic': + deposit_rho_prange_cubic( + self.x, self.y, self.z, self.w, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + rho_m0_global, rho_m1_global, + self.nthreads, ptcl_chunk_indices ) + else: + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # Sum thread-local results to main field array + sum_reduce_2d_array( rho_m0_global, grid[0].rho ) + sum_reduce_2d_array( rho_m1_global, grid[1].rho ) + + elif fieldtype == 'J': + # Generate temporary arrays for J + Jr_m0_global = np.zeros( + (self.nthreads, grid[0].Jr.shape[0], grid[0].Jr.shape[1]), + dtype=grid[0].Jr.dtype ) + Jt_m0_global = np.zeros( + (self.nthreads, grid[0].Jt.shape[0], grid[0].Jt.shape[1]), + dtype=grid[0].Jt.dtype ) + Jz_m0_global = np.zeros( + (self.nthreads, grid[0].Jz.shape[0], grid[0].Jz.shape[1]), + dtype=grid[0].Jz.dtype ) + Jr_m1_global = np.zeros( + (self.nthreads, grid[1].Jr.shape[0], grid[1].Jr.shape[1]), + dtype=grid[1].Jr.dtype ) + Jt_m1_global = np.zeros( + (self.nthreads, grid[1].Jt.shape[0], grid[1].Jt.shape[1]), + dtype=grid[1].Jt.dtype ) + Jz_m1_global = np.zeros( + (self.nthreads, grid[1].Jz.shape[0], grid[1].Jz.shape[1]), + dtype=grid[1].Jz.dtype ) + # Deposit J using CPU threading + if self.particle_shape == 'linear': + deposit_J_prange_linear( self.x, self.y, self.z, self.w, self.ux, self.uy, self.uz, self.inv_gamma, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, - grid[0].Jr, grid[1].Jr, - grid[0].Jt, grid[1].Jt, - grid[0].Jz, grid[1].Jz, - self.cell_idx, self.prefix_sum) + Jr_m0_global, Jr_m1_global, + Jt_m0_global, Jt_m1_global, + Jz_m0_global, Jz_m1_global, + self.nthreads, ptcl_chunk_indices ) + elif self.particle_shape == 'cubic': + deposit_J_prange_cubic( + self.x, self.y, self.z, self.w, + self.ux, self.uy, self.uz, self.inv_gamma, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + Jr_m0_global, Jr_m1_global, + Jt_m0_global, Jt_m1_global, + Jz_m0_global, Jz_m1_global, + self.nthreads, ptcl_chunk_indices ) else: raise ValueError("`particle_shape` should be either \ - 'linear', 'linear_atomic' or 'cubic' \ + 'linear' or 'cubic' \ but is `%s`" % self.particle_shape) + # Sum thread-local results to main field array + sum_reduce_2d_array( Jr_m0_global, grid[0].Jr ) + sum_reduce_2d_array( Jt_m0_global, grid[0].Jt ) + sum_reduce_2d_array( Jz_m0_global, grid[0].Jz ) + sum_reduce_2d_array( Jr_m1_global, grid[1].Jr ) + sum_reduce_2d_array( Jt_m1_global, grid[1].Jt ) + sum_reduce_2d_array( Jz_m1_global, grid[1].Jz ) + else: raise ValueError("`fieldtype` should be either 'J' or \ 'rho', but is `%s`" % fieldtype) - - - # CPU version + # CPU single-core version else: # Preliminary arrays for the cylindrical conversion r = np.sqrt( self.x**2 + self.y**2 ) @@ -745,7 +864,7 @@ def deposit( self, fld, fieldtype ) : # number 
of elements in the grid list Nm = len(grid) - if fieldtype == 'rho' : + if fieldtype == 'rho': # --------------------------------------- # Deposit the charge density mode by mode # --------------------------------------- @@ -765,7 +884,7 @@ def deposit( self, fld, fieldtype ) : deposit_field_numba(self.w*exptheta, grid[m].rho, iz, ir, Sz, Sr, -1.) - elif fieldtype == 'J' : + elif fieldtype == 'J': # ---------------------------------------- # Deposit the current density mode by mode # ---------------------------------------- @@ -793,9 +912,9 @@ def deposit( self, fld, fieldtype ) : deposit_field_numba(Jz*exptheta, grid[m].Jz, iz, ir, Sz, Sr, -1.) - else : - raise ValueError( - "`fieldtype` should be either 'J' or 'rho', but is `%s`" %fieldtype ) + else: + raise ValueError("`fieldtype` should be either 'J' or \ + 'rho', but is `%s`" % fieldtype) def sort_particles(self, fld): """ diff --git a/fbpic/particles/push/__init__.py b/fbpic/particles/push/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/fbpic/particles/push/cuda_methods.py b/fbpic/particles/push/cuda_methods.py new file mode 100644 index 000000000..41d48d12a --- /dev/null +++ b/fbpic/particles/push/cuda_methods.py @@ -0,0 +1,191 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the particle push methods on the GPU using CUDA. +""" +from numba import cuda +import math +from scipy.constants import c, e + +@cuda.jit('void(float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], \ + float64[:], float64)') +def push_x_gpu( x, y, z, ux, uy, uz, inv_gamma, dt ) : + """ + Advance the particles' positions over one half-timestep + + This assumes that the positions (x, y, z) are initially either + one half-timestep *behind* the momenta (ux, uy, uz), or at the + same timestep as the momenta. 
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+        (is modified by this function)
+
+    ux, uy, uz : 1darray of floats (in meters * second^-1)
+        The velocity of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    dt : float (seconds)
+        The time by which the position is advanced
+    """
+    # Half timestep, multiplied by c
+    chdt = c*0.5*dt
+
+    i = cuda.grid(1)
+    if i < x.shape[0]:
+        # Particle push
+        inv_g = inv_gamma[i]
+        x[i] += chdt*inv_g*ux[i]
+        y[i] += chdt*inv_g*uy[i]
+        z[i] += chdt*inv_g*uz[i]
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+    float64[:], float64[:], float64[:], \
+    float64[:], float64[:], float64[:], \
+    float64, float64, int32, float64)')
+def push_p_gpu( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz,
+                q, m, Ntot, dt ) :
+    """
+    Advance the particles' momenta, using cuda on the GPU
+
+    Parameters
+    ----------
+    ux, uy, uz : 1darray of floats
+        The velocity of the particles
+        (is modified by this function)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+
+    q : float
+        The charge of the particle species
+
+    m : float
+        The mass of the particle species
+
+    Ntot : int
+        The total number of particles
+
+    dt : float
+        The time by which the momenta are advanced
+    """
+    # Set a few constants
+    econst = q*dt/(m*c)
+    bconst = 0.5*q*dt/m
+
+    # Cuda 1D grid
+    ip = cuda.grid(1)
+
+    # Loop over the particles
+    if ip < Ntot:
+        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
+            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst)
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+    float64[:], float64[:], float64[:], \
+    float64[:], float64[:], float64[:], \
+    float64, int32, float64, int16[:])')
+def push_p_ioniz_gpu( ux, uy, uz, inv_gamma,
+                      Ex, Ey, Ez, Bx, By, Bz,
+                      m, Ntot, dt, ionization_level ) :
+    """
+    Advance the particles' momenta, using numba on the GPU.
+    This takes into account that the particles are ionizable, and thus
+    that their charge is determined by `ionization_level`
+
+    Parameters
+    ----------
+    ux, uy, uz : 1darray of floats
+        The velocity of the particles
+        (is modified by this function)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+
+    m : float
+        The mass of the particle species
+
+    Ntot : int
+        The total number of particles
+
+    dt : float
+        The time by which the momenta are advanced
+
+    ionization_level : 1darray of ints
+        The number of electrons that each ion is missing
+        (compared to a neutral atom)
+    """
+    # Cuda 1D grid
+    ip = cuda.grid(1)
+
+    # Loop over the particles
+    if ip < Ntot:
+        if ionization_level[ip] != 0:
+            # Set a few constants
+            econst = ionization_level[ip] * e * dt/(m*c)
+            bconst = 0.5 * ionization_level[ip] * e * dt/m
+            # Use the Vay pusher
+            ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
+                ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+                Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst)
+
+@cuda.jit(device=True, inline=True)
+def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
+                Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
+    """
+    Push a single macroparticle,
using the Vay pusher
+    """
+    # Get the magnetic rotation vector
+    taux = bconst*Bx
+    tauy = bconst*By
+    tauz = bconst*Bz
+    tau2 = taux**2 + tauy**2 + tauz**2
+
+    # Get the momenta at the half timestep
+    uxp = ux_i + econst*Ex \
+    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
+    uyp = uy_i + econst*Ey \
+    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
+    uzp = uz_i + econst*Ez \
+    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
+    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
+    utau = uxp*taux + uyp*tauy + uzp*tauz
+
+    # Get the new 1./gamma
+    inv_gamma_f = math.sqrt(
+        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
+
+    # Reuse the tau and utau variables to save memory
+    tx = inv_gamma_f*taux
+    ty = inv_gamma_f*tauy
+    tz = inv_gamma_f*tauz
+    ut = inv_gamma_f*utau
+    s = 1./( 1 + tau2*inv_gamma_f**2 )
+
+    # Get the new u
+    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
+    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
+    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
+
+    return( ux_f, uy_f, uz_f, inv_gamma_f )
\ No newline at end of file
diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py
new file mode 100644
index 000000000..6e3843e1b
--- /dev/null
+++ b/fbpic/particles/push/numba_methods.py
@@ -0,0 +1,117 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the particle push methods on the CPU with numba.
+"""
+import math
+import numba
+from fbpic.threading_utils import njit_parallel, prange
+from scipy.constants import c, e
+
+@njit_parallel
+def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
+    """
+    Advance the particles' positions over one half-timestep
+
+    This assumes that the positions (x, y, z) are initially either
+    one half-timestep *behind* the momenta (ux, uy, uz), or at the
+    same timestep as the momenta.
+    """
+    # Half timestep, multiplied by c
+    chdt = c*0.5*dt
+
+    # Particle push (in parallel if threading is installed)
+    for ip in prange(Ntot) :
+        x[ip] += chdt * inv_gamma[ip] * ux[ip]
+        y[ip] += chdt * inv_gamma[ip] * uy[ip]
+        z[ip] += chdt * inv_gamma[ip] * uz[ip]
+
+    return x, y, z
+
+@njit_parallel
+def push_p_numba( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) :
+    """
+    Advance the particles' momenta, using numba
+    """
+    # Set a few constants
+    econst = q*dt/(m*c)
+    bconst = 0.5*q*dt/m
+
+    # Loop over the particles (in parallel if threading is installed)
+    for ip in prange(Ntot) :
+        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
+            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst )
+
+    return ux, uy, uz, inv_gamma
+
+@njit_parallel
+def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) :
+    """
+    Advance the particles' momenta, using numba
+    """
+    # Set a few constants
+    prefactor_econst = e*dt/(m*c)
+    prefactor_bconst = 0.5*e*dt/m
+
+    # Loop over the particles (in parallel if threading is installed)
+    for ip in prange(Ntot) :
+
+        # For neutral macroparticles, skip this step
+        if ionization_level[ip] == 0:
+            continue
+
+        # Calculate the charge dependent constants
+        econst = prefactor_econst * ionization_level[ip]
+        bconst = prefactor_bconst * ionization_level[ip]
+        # Perform the push
+        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
+            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip],
+            econst, bconst )
+
+    return ux, uy, uz, inv_gamma
+
+@numba.njit
+def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
+    Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
+    """
+    Push a single macroparticle, using the Vay pusher
+    """
+    # Get the magnetic rotation vector
+    taux = bconst*Bx
+    tauy = bconst*By
+    tauz = bconst*Bz
+    tau2 = taux**2 + tauy**2 + tauz**2
+
+    # Get the momenta at the half timestep
+    uxp = ux_i + econst*Ex \
+    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
+    uyp = uy_i + econst*Ey \
+    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
+    uzp = uz_i + econst*Ez \
+    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
+    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
+    utau = uxp*taux + uyp*tauy + uzp*tauz
+
+    # Get the new 1./gamma
+    inv_gamma_f = math.sqrt(
+        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
+
+    # Reuse the tau and utau variables to save memory
+    tx = inv_gamma_f*taux
+    ty = inv_gamma_f*tauy
+    tz = inv_gamma_f*tauz
+    ut = inv_gamma_f*utau
+    s = 1./( 1 + tau2*inv_gamma_f**2 )
+
+    # Get the new u
+    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
+    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
+    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
+
+    return( ux_f, uy_f, uz_f, inv_gamma_f )
diff --git a/fbpic/particles/utilities/__init__.py b/fbpic/particles/utilities/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fbpic/particles/utilities/cuda_sorting.py b/fbpic/particles/utilities/cuda_sorting.py
new file mode 100644
index 000000000..45a1ddb9f
--- /dev/null
+++ b/fbpic/particles/utilities/cuda_sorting.py
@@ -0,0 +1,184 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the particle sorting methods on the GPU using CUDA.
+"""
+from numba import cuda
+from accelerate.cuda import sorting
+import math
+import numpy as np
+
+# -----------------------------------------------------
+# Sorting utilities - get_cell_idx / sort / prefix_sum
+# -----------------------------------------------------
+
+@cuda.jit('void(int32[:], uint32[:], \
+    float64[:], float64[:], float64[:], \
+    float64, float64, int32, \
+    float64, float64, int32)')
+def get_cell_idx_per_particle(cell_idx, sorted_idx,
+                              x, y, z,
+                              invdz, zmin, Nz,
+                              invdr, rmin, Nr):
+    """
+    Get the cell index of each particle.
+    The cell index is 1d and calculated by:
+    cell index in r + cell index in z * number of cells in r.
+    The cell_idx of a particle is defined by
+    the lower cell in r and z that it deposits its field to.
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    sorted_idx : 1darray of integers
+        The sorted index array; it is reset
+        by this function before doing the sort
+
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box, in each direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+    """
+    i = cuda.grid(1)
+    if i < cell_idx.shape[0]:
+        # Preliminary variables for the cylindrical conversion
+        xj = x[i]
+        yj = y[i]
+        zj = z[i]
+        rj = math.sqrt( xj**2 + yj**2 )
+
+        # Positions of the particles, in the cell unit
+        r_cell = invdr*(rj - rmin) - 0.5
+        z_cell = invdz*(zj - zmin) - 0.5
+
+        # Index of the lower cell in r and z
+        ir_lower = int(math.floor( r_cell ))
+        iz_lower = int(math.floor( z_cell ))
+
+        # Treat the boundary conditions
+        # guard cells in lower r
+        if ir_lower < 0:
+            ir_lower = 0
+        # absorbing in upper r
+        if ir_lower > Nr-1:
+            ir_lower = Nr-1
+        # periodic boundaries in z
+        if iz_lower < 0:
+            iz_lower += Nz
+        if iz_lower > Nz-1:
+            iz_lower -= Nz
+
+        # Reset sorted_idx array
+        sorted_idx[i] = i
+        # Calculate the 1D cell_idx by cell_idx_ir + cell_idx_iz * Nr
+        cell_idx[i] = ir_lower + iz_lower * Nr
+
+def sort_particles_per_cell(cell_idx, sorted_idx):
+    """
+    Sort the cell index of the particles and
+    modify the sorted index array accordingly.
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    sorted_idx : 1darray of integers
+        Represents the original index of the
+        particle before the sorting.
+    """
+    Ntot = cell_idx.shape[0]
+    if Ntot > 0:
+        sorter = sorting.RadixSort(Ntot, dtype = np.int32)
+        sorter.sort(cell_idx, vals = sorted_idx)
+
+@cuda.jit('void(int32[:], int32[:])')
+def incl_prefix_sum(cell_idx, prefix_sum):
+    """
+    Perform an inclusive parallel prefix sum on the sorted
+    cell index array. The prefix sum array represents the
+    cumulative sum of the number of particles per cell
+    for each cell index.
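# ---------------------------------------------------------------------
# Worked example (not part of the patch) of the inclusive prefix sum:
# for 4 grid cells and sorted cell indices [0, 0, 2, 2, 3], the array
# below is [2, 2, 4, 5], i.e. prefix_sum[c] counts the particles in
# cells 0..c, so cell c owns the slice prefix_sum[c-1]:prefix_sum[c].
# The same result can be cross-checked on the CPU with NumPy:
import numpy as np
cell_idx = np.array([0, 0, 2, 2, 3])
prefix_sum = np.searchsorted( cell_idx, np.arange(4), side='right' )
# -> array([2, 2, 4, 5])
# ---------------------------------------------------------------------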
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    # i is the index of the macroparticle
+    i = cuda.grid(1)
+    if i < cell_idx.shape[0]-1:
+        # ci: index of the cell of the present macroparticle
+        ci = cell_idx[i]
+        # ci_next: index of the cell of the next macroparticle
+        ci_next = cell_idx[i+1]
+        # Fill all the cells between ci and ci_next with the
+        # inclusive cumulative sum of the number of particles up to ci
+        while ci < ci_next:
+            # The cumulative sum of the number of particles per cell
+            # up to ci is i+1 (since i is a 0-based Python index)
+            prefix_sum[ci] = i+1
+            ci += 1
+    # The last "macroparticle" of the cell_idx array fills up the
+    # rest of the prefix sum array
+    if i == cell_idx.shape[0]-1:
+        # Get the cell_index of the last macroparticle
+        ci = cell_idx[i]
+        # Fill all the remaining entries of the prefix sum array
+        for empty_index in range(ci, prefix_sum.shape[0]):
+            prefix_sum[empty_index] = i+1
+
+@cuda.jit('void(int32[:])')
+def reset_prefix_sum(prefix_sum):
+    """
+    Resets the prefix sum. Sets all the values
+    to zero.
+
+    Parameters
+    ----------
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    i = cuda.grid(1)
+    if i < prefix_sum.shape[0]:
+        prefix_sum[i] = 0
+
+@cuda.jit('void(uint32[:], float64[:], float64[:])')
+def write_sorting_buffer(sorted_idx, val, buf):
+    """
+    Writes the values of a particle array to a buffer,
+    while rearranging them to match the sorted cell index array.
+
+    Parameters
+    ----------
+    sorted_idx : 1darray of integers
+        Represents the original index of the
+        particle before the sorting
+
+    val : 1d array of floats
+        A particle data array
+
+    buf : 1d array of floats
+        A buffer array to temporarily store the
+        sorted particle data array
+    """
+    i = cuda.grid(1)
+    if i < val.shape[0]:
+        buf[i] = val[sorted_idx[i]]
diff --git a/fbpic/particles/utility_methods.py b/fbpic/particles/utilities/utility_methods.py
similarity index 98%
rename from fbpic/particles/utility_methods.py
rename to fbpic/particles/utilities/utility_methods.py
index d0de6c5b8..d79019f0d 100644
--- a/fbpic/particles/utility_methods.py
+++ b/fbpic/particles/utilities/utility_methods.py
@@ -3,7 +3,7 @@
 # License: 3-Clause-BSD-LBNL
 """
 This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the optimized particles methods that use numba on a CPU
+It defines particle utility methods.
 """
 import numpy as np
diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py
new file mode 100644
index 000000000..a35d9a8c5
--- /dev/null
+++ b/fbpic/threading_utils.py
@@ -0,0 +1,39 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines a set of generic functions for multithreaded CPU execution.
+"""
+import os
+from numba import njit
+
+# By default threading is enabled
+threading_enabled = True
+
+# Check if the environment variable FBPIC_DISABLE_THREADING is set to 1
+# and in that case, disable threading
+if 'FBPIC_DISABLE_THREADING' in os.environ:
+    if int(os.environ['FBPIC_DISABLE_THREADING']) == 1:
+        threading_enabled = False
+
+# If the user requests threading (by not setting FBPIC_DISABLE_THREADING),
+# check that it is indeed available
+if threading_enabled:
+    try:
+        # Try to import the threading function prange
+        from numba import prange as numba_prange
+    except ImportError:
+        threading_enabled = False
+        print('*** Threading not available for the simulation.')
+        print('*** (Please make sure that numba>0.34 is installed)')
+
+# Set the functions njit_parallel and prange to the correct objects
+if not threading_enabled:
+    # Use the regular serial compilation function
+    njit_parallel = njit
+    prange = range
+else:
+    # Use the parallel compilation function
+    njit_parallel = njit( parallel=True )
+    prange = numba_prange
diff --git a/tests/test_uniform_rho_deposition.py b/tests/test_uniform_rho_deposition.py
index e0a7978e6..8978f9625 100644
--- a/tests/test_uniform_rho_deposition.py
+++ b/tests/test_uniform_rho_deposition.py
@@ -10,7 +10,7 @@
 (i.e. this confirms that no Verboncoeur-type correction is needed)
 - Shifting this plasma by a small amount in r, and still verifying
 that the deposited density is uniform
-The tests are performed with different particle shapes: linear
+The tests are performed with different particle shapes: linear, cubic

 Usage :
 from the top-level directory of FBPIC run
@@ -47,7 +47,7 @@ def test_uniform_electron_plasma(show=False):
     "Function that is run by py.test, when doing `python setup.py test`"
-    for shape in ['cubic', 'linear', 'linear_non_atomic']:
+    for shape in ['linear', 'cubic']:
         uniform_electron_plasma( shape, show )

 def uniform_electron_plasma(shape, show=False):
@@ -87,7 +87,7 @@ def test_neutral_plasma_shifted(show=False):
     "Function that is run by py.test, when doing `python setup.py test`"
-    for shape in ['cubic', 'linear', 'linear_non_atomic']:
+    for shape in ['linear', 'cubic']:
         neutral_plasma_shifted( shape, show )

 def neutral_plasma_shifted(shape, show=False):
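A short usage sketch of the fallback defined in fbpic/threading_utils.py above. Because njit_parallel and prange degrade to njit and range when threading is unavailable, the same kernel source runs multithreaded or serial; the function scale_cpu below is a hypothetical example, not part of the patch:

from fbpic.threading_utils import njit_parallel, prange

@njit_parallel
def scale_cpu( x, a ):
    # Runs in parallel when numba>0.34 provides prange,
    # and falls back to a serial loop otherwise
    for i in prange( x.shape[0] ):
        x[i] = a * x[i]
    return x

# Threading can also be disabled explicitly, e.g. for debugging:
#   FBPIC_DISABLE_THREADING=1 python fbpic_script.py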