From 9ae48f399068c53e3fef8e1f76c4b534edc25ec4 Mon Sep 17 00:00:00 2001 From: Manuel Kirchen Date: Fri, 14 Jul 2017 18:51:03 +0200 Subject: [PATCH 01/36] Initial CPU multi-threading implementation --- fbpic/main.py | 51 +- fbpic/particles/cuda_deposition/__init__.py | 0 fbpic/particles/cuda_deposition/cubic.py | 1214 ----------------- fbpic/particles/cuda_deposition/linear.py | 482 ------- .../cuda_deposition/linear_non_atomic.py | 671 --------- fbpic/particles/cuda_methods.py | 997 -------------- fbpic/particles/numba_methods.py | 251 ---- fbpic/particles/particles.py | 285 +++- fbpic/particles/utility_methods.py | 155 --- 9 files changed, 247 insertions(+), 3859 deletions(-) delete mode 100644 fbpic/particles/cuda_deposition/__init__.py delete mode 100644 fbpic/particles/cuda_deposition/cubic.py delete mode 100644 fbpic/particles/cuda_deposition/linear.py delete mode 100644 fbpic/particles/cuda_deposition/linear_non_atomic.py delete mode 100644 fbpic/particles/cuda_methods.py delete mode 100644 fbpic/particles/numba_methods.py delete mode 100644 fbpic/particles/utility_methods.py diff --git a/fbpic/main.py b/fbpic/main.py index f508b0f69..f33554966 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -10,6 +10,7 @@ # (This needs to be done before the other imports, # as it sets the cuda context) from mpi4py import MPI +import numba # Check if CUDA is available, then import CUDA functions from .cuda_utils import cuda_installed if cuda_installed: @@ -41,12 +42,11 @@ class Simulation(object): def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax, p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0., n_order=-1, dens_func=None, filter_currents=True, - v_comoving=None, use_galilean=False, - initialize_ions=False, use_cuda=False, - n_guard=None, n_damp=30, - exchange_period=None, boundaries='periodic', - gamma_boost=None, use_all_mpi_ranks=True, - particle_shape='linear' ): + v_comoving=None, use_galilean=False, initialize_ions=False, + use_cuda=False, 
use_threading=True, nthreads=None, + n_guard=None, n_damp=30, exchange_period=None, + boundaries='periodic', gamma_boost=None, + use_all_mpi_ranks=True, particle_shape='linear' ): """ Initializes a simulation, by creating the following structures: @@ -132,6 +132,12 @@ def dens_func( z, r ) ... use_cuda: bool, optional Wether to use CUDA (GPU) acceleration + use_threading : bool, optional + Whether to use multi-threading on the CPU. + nthreads: int, optional + Number of CPU multi-threading threads used (if use_threading + is set). If nthreads is set to None, the number of threads + is automatically determined. n_guard: int, optional Number of guard cells to use at the left and right of @@ -186,13 +192,23 @@ def dens_func( z, r ) ... to first order shapes, 'linear_non_atomic' uses an equivalent deposition scheme to 'linear' which avoids atomics on the GPU. """ - # Check whether to use cuda + # Check whether to use CUDA self.use_cuda = use_cuda if (use_cuda==True) and (cuda_installed==False): print('*** Cuda not available for the simulation.') print('*** Performing the simulation on CPU.') self.use_cuda = False - + # CPU multi-threading + self.use_threading = use_threading + if self.use_threading: + # Define number of threads used + if nthreads is not None: + # Set user-defined number of threads + self.nthreads = nthreads + numba.config.NUMBA_NUM_THREADS = self.nthreads + else: + # Automatically take numba preset for number of threads + self.nthreads = numba.config.NUMBA_NUM_THREADS # Register the comoving parameters self.v_comoving = v_comoving self.use_galilean = use_galilean @@ -234,19 +250,20 @@ def dens_func( z, r ) ... 
# Initialize the electrons and the ions grid_shape = self.fld.interp[0].Ez.shape self.ptcl = [ - Particles( q=-e, m=m_e, n=n_e, Npz=Npz, zmin=p_zmin, - zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, - Nptheta=p_nt, dt=dt, dens_func=dens_func, - use_cuda=self.use_cuda, uz_m=uz_m, - grid_shape=grid_shape, particle_shape=particle_shape) ] + Particles(q=-e, m=m_e, n=n_e, Npz=Npz, zmin=p_zmin, + zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, + Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m, + grid_shape=grid_shape, particle_shape=particle_shape, + use_cuda=self.use_cuda, + use_threading=self.use_threading) ] if initialize_ions : self.ptcl.append( Particles(q=e, m=m_p, n=n_e, Npz=Npz, zmin=p_zmin, zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, - Nptheta=p_nt, dt=dt, dens_func=dens_func, - use_cuda=self.use_cuda, uz_m=uz_m, - grid_shape=grid_shape, - particle_shape=particle_shape ) ) + Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m, + grid_shape=grid_shape, particle_shape=particle_shape, + use_cuda=self.use_cuda, + use_threading=self.use_threading) ) # Register the number of particles per cell along z, and dt # (Necessary for the moving window) diff --git a/fbpic/particles/cuda_deposition/__init__.py b/fbpic/particles/cuda_deposition/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/fbpic/particles/cuda_deposition/cubic.py b/fbpic/particles/cuda_deposition/cubic.py deleted file mode 100644 index 45e15e0eb..000000000 --- a/fbpic/particles/cuda_deposition/cubic.py +++ /dev/null @@ -1,1214 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the deposition methods for rho and J for cubic order shapes -""" -from numba import cuda, int64 -import math -from scipy.constants import c -import numpy as np - -# Shape Factor helper functions to compute particle shapes. 
- - -@cuda.jit(device=True, inline=True) -def z_shape(cell_position, index): - iz = int64(math.floor(cell_position)) - 1 - if index == 0: - return (-1./6.)*((cell_position-iz)-2)**3 - if index == 1: - return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) - if index == 2: - return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) - if index == 3: - return (-1./6.)*(((iz+3)-cell_position)-2)**3 - - -@cuda.jit(device=True, inline=True) -def r_shape(cell_position, index): - flip_factor = 1. - ir = int64(math.floor(cell_position)) - 1 - if index == 0: - if ir < 0: - flip_factor = -1. - return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 - if index == 1: - if ir+1 < 0: - flip_factor = -1. - return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) - if index == 2: - if ir+2 < 0: - flip_factor = -1. - return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) - if index == 3: - if ir+3 < 0: - flip_factor = -1. - return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 - -# ------------------------------- -# Field deposition utility - rho -# ------------------------------- - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - int32[:], int32[:])') -def deposit_rho_gpu_cubic(x, y, z, w, - invdz, zmin, Nz, - invdr, rmin, Nr, - rho_m0, rho_m1, - cell_idx, prefix_sum): - """ - Deposition of the charge density rho using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of rho that is deposited to the - 16 cells surounding the particle based on its shape (cubic). 
- - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 16 variables (one for each surrounding cell) to - maintain parallelism while avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - rho_m0, rho_m1 : 2darrays of complexs - The charge density on the interpolation grid for - mode 0 and 1. (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i / Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare local field arrays - R_m0_00 = 0. - R_m1_00 = 0. + 0.j - - R_m0_01 = 0. - R_m1_01 = 0. + 0.j - - R_m0_02 = 0. - R_m1_02 = 0. + 0.j - - R_m0_03 = 0. - R_m1_03 = 0. + 0.j - - R_m0_10 = 0. - R_m1_10 = 0. 
+ 0.j - - R_m0_11 = 0. - R_m1_11 = 0. + 0.j - - R_m0_12 = 0. - R_m1_12 = 0. + 0.j - - R_m0_13 = 0. - R_m1_13 = 0. + 0.j - - R_m0_20 = 0. - R_m1_20 = 0. + 0.j - - R_m0_21 = 0. - R_m1_21 = 0. + 0.j - - R_m0_22 = 0. - R_m1_22 = 0. + 0.j - - R_m0_23 = 0. - R_m1_23 = 0. + 0.j - - R_m0_30 = 0. - R_m1_30 = 0. + 0.j - - R_m0_31 = 0. - R_m1_31 = 0. + 0.j - - R_m0_32 = 0. - R_m1_32 = 0. + 0.j - - R_m0_33 = 0. - R_m1_33 = 0. + 0.j - - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. 
- exptheta_m1 = cos + 1.j*sin - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0_scal = wj * exptheta_m0 - # Mode 1 - R_m1_scal = wj * exptheta_m1 - # Compute values in local copies and consider boundaries - ir0 = int64(math.floor(r_cell)) - 1 - - if (ir0 == -2): - R_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 
0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - - if (ir0 == -1): - R_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal - 
R_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - if (ir0 >= 0): - R_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal - R_m0_31 += 
r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - - # Index Shifting since local copies are centered around - # the current cell - srl = 0 # shift r lower - sru = 0 # shift r upper inner - sru2 = 0 # shift r upper outer - szl = 0 # shift z lower - szu = 0 # shift z upper inner - szu2 = 0 # shift z upper outer - if (iz_cell-1) < 0: - szl += Nz - if (iz_cell) == (Nz - 1): - szu -= Nz - szu2 -= Nz - if (iz_cell+1) == (Nz - 1): - szu2 -= Nz - if (ir_cell) >= (Nr - 1): - sru = -1 - sru2 = -2 - if (ir_cell+1) == (Nr - 1): - sru2 = -1 - if (ir_cell-1) < 0: - srl = 1 - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m0_00.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell - 1 + srl), R_m0_01.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell - 1 + srl), R_m1_01.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell - 1 + srl), R_m1_01.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m0_02.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m0_03.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 
+ szl, ir_cell ), R_m0_10.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell ), R_m1_10.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell ), R_m1_10.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_11.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_11.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_11.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell), R_m0_12.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell), R_m1_12.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell), R_m1_12.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell), R_m0_13.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell), R_m1_13.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell), R_m1_13.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m0_20.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 1 + sru), R_m0_21.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 1 + sru), R_m1_21.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 1 + sru), R_m1_21.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m0_22.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m0_23.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m0_30.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), 
R_m1_30.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 2 + sru2), R_m0_31.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 2 + sru2), R_m1_31.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 2 + sru2), R_m1_31.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m0_32.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m0_33.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.imag) - - -# ------------------------------- -# Field deposition utility - J -# ------------------------------- - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:],\ - int32[:], int32[:])') -def deposit_J_gpu_cubic(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - j_r_m0, j_r_m1, - j_t_m0, j_t_m1, - j_z_m0, j_z_m1, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 16 cells surounding the particle based on its shape (cubic). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 16 variables (one for each cell) to maintain - parallelism while avoiding any race conditions. 
- - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs - The current component in each direction (r, t, z) - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i/Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare the local field value for - # all possible deposition directions, - # depending on the shape order and per mode for r,t and z. - J_r_m0_00 = 0. - J_t_m0_00 = 0. - J_z_m0_00 = 0. - J_r_m1_00 = 0. + 0.j - J_t_m1_00 = 0. 
+ 0.j - J_z_m1_00 = 0. + 0.j - - J_r_m0_01 = 0. - J_t_m0_01 = 0. - J_z_m0_01 = 0. - J_r_m1_01 = 0. + 0.j - J_t_m1_01 = 0. + 0.j - J_z_m1_01 = 0. + 0.j - - J_r_m0_02 = 0. - J_t_m0_02 = 0. - J_z_m0_02 = 0. - J_r_m1_02 = 0. + 0.j - J_t_m1_02 = 0. + 0.j - J_z_m1_02 = 0. + 0.j - - J_r_m0_03 = 0. - J_t_m0_03 = 0. - J_z_m0_03 = 0. - J_r_m1_03 = 0. + 0.j - J_t_m1_03 = 0. + 0.j - J_z_m1_03 = 0. + 0.j - - J_r_m0_10 = 0. - J_t_m0_10 = 0. - J_z_m0_10 = 0. - J_r_m1_10 = 0. + 0.j - J_t_m1_10 = 0. + 0.j - J_z_m1_10 = 0. + 0.j - - J_r_m0_11 = 0. - J_t_m0_11 = 0. - J_z_m0_11 = 0. - J_r_m1_11 = 0. + 0.j - J_t_m1_11 = 0. + 0.j - J_z_m1_11 = 0. + 0.j - - J_r_m0_12 = 0. - J_t_m0_12 = 0. - J_z_m0_12 = 0. - J_r_m1_12 = 0. + 0.j - J_t_m1_12 = 0. + 0.j - J_z_m1_12 = 0. + 0.j - - J_r_m0_13 = 0. - J_t_m0_13 = 0. - J_z_m0_13 = 0. - J_r_m1_13 = 0. + 0.j - J_t_m1_13 = 0. + 0.j - J_z_m1_13 = 0. + 0.j - - J_r_m0_20 = 0. - J_t_m0_20 = 0. - J_z_m0_20 = 0. - J_r_m1_20 = 0. + 0.j - J_t_m1_20 = 0. + 0.j - J_z_m1_20 = 0. + 0.j - - J_r_m0_21 = 0. - J_t_m0_21 = 0. - J_z_m0_21 = 0. - J_r_m1_21 = 0. + 0.j - J_t_m1_21 = 0. + 0.j - J_z_m1_21 = 0. + 0.j - - J_r_m0_22 = 0. - J_t_m0_22 = 0. - J_z_m0_22 = 0. - J_r_m1_22 = 0. + 0.j - J_t_m1_22 = 0. + 0.j - J_z_m1_22 = 0. + 0.j - - J_r_m0_23 = 0. - J_t_m0_23 = 0. - J_z_m0_23 = 0. - J_r_m1_23 = 0. + 0.j - J_t_m1_23 = 0. + 0.j - J_z_m1_23 = 0. + 0.j - - J_r_m0_30 = 0. - J_t_m0_30 = 0. - J_z_m0_30 = 0. - J_r_m1_30 = 0. + 0.j - J_t_m1_30 = 0. + 0.j - J_z_m1_30 = 0. + 0.j - - J_r_m0_31 = 0. - J_t_m0_31 = 0. - J_z_m0_31 = 0. - J_r_m1_31 = 0. + 0.j - J_t_m1_31 = 0. + 0.j - J_z_m1_31 = 0. + 0.j - - J_r_m0_32 = 0. - J_t_m0_32 = 0. - J_z_m0_32 = 0. - J_r_m1_32 = 0. + 0.j - J_t_m1_32 = 0. + 0.j - J_z_m1_32 = 0. + 0.j - - J_r_m0_33 = 0. - J_t_m0_33 = 0. - J_z_m0_33 = 0. - J_r_m1_33 = 0. + 0.j - J_t_m1_33 = 0. + 0.j - J_z_m1_33 = 0. 
+ 0.j - - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 - J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 - J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 - # Mode 1 - J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 - J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 - J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 - - # Compute values in local copies and consider boundaries - ir0 = int64(math.floor(r_cell)) - 1 - - if (ir0 == -2): - J_r_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 
2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal - J_r_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - - J_t_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += 
r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 
1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - if (ir0 == -1): - J_r_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - 
J_r_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_33 += r_shape(r_cell, 
3)*z_shape(z_cell, 3)*J_r_m1_scal - - J_t_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - 
J_t_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_32 += r_shape(r_cell, 
3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - if (ir0 >= 0): - J_r_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 
1)*J_r_m0_scal - J_r_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal - - J_t_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_30 += 
r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 
3)*J_z_m1_scal - - J_z_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - - # Index Shifting since local copies are centered around - # the current cell - srl = 0 # shift r lower - sru = 0 # shift r upper inner - sru2 = 0 # shift r upper outer - szl = 0 # shift z lower - szu = 0 # shift z upper inner - szu2 = 0 # shift z upper outer - if (iz_cell-1) < 0: - szl += Nz - if (iz_cell) == (Nz - 1): - szu -= Nz - szu2 -= Nz - if (iz_cell+1) == (Nz - 1): - szu2 -= Nz - if (ir_cell) >= (Nr - 1): - sru = -1 - sru2 = -2 - if (ir_cell+1) == (Nr - 1): - sru2 = -1 - if (ir_cell-1) < 0: - srl = 1 - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m0_00) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell - 1 + srl), J_r_m0_01) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m0_02) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m0_03) - cuda.atomic.add(j_r_m1.real, 
(iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell ), J_r_m0_10) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_11) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_11.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_11.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell), J_r_m0_12) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell), J_r_m0_13) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m0_20) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 1 + sru), J_r_m0_21) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m0_22) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m0_23) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + 
szu2, ir_cell + 1 + sru), J_r_m1_23.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m0_30) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 2 + sru2), J_r_m0_31) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m0_32) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m0_33) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m0_00) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell - 1 + srl), J_t_m0_01) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m0_02) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m0_03) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.real) - 
cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell ), J_t_m0_10) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_11) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_11.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_11.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell), J_t_m0_12) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell), J_t_m0_13) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m0_20) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 1 + sru), J_t_m0_21) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m0_22) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m0_23) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.imag) - - 
cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m0_30) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 2 + sru2), J_t_m0_31) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m0_32) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m0_33) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.imag) - - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m0_00) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell - 1 + srl), J_z_m0_01) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m0_02) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m0_03) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + 
srl), J_z_m1_03.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell ), J_z_m0_10) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_11) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_11.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_11.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell), J_z_m0_12) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell), J_z_m0_13) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m0_20) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 1 + sru), J_z_m0_21) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m0_22) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m0_23) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m0_30) 
- cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 2 + sru2), J_z_m0_31) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m0_32) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m0_33) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.imag) diff --git a/fbpic/particles/cuda_deposition/linear.py b/fbpic/particles/cuda_deposition/linear.py deleted file mode 100644 index 48adff845..000000000 --- a/fbpic/particles/cuda_deposition/linear.py +++ /dev/null @@ -1,482 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the deposition methods for rho and J for linear order shapes -""" -from numba import cuda, int64 -import math -from scipy.constants import c -import numpy as np - - -# Shape Factor functions to compute particle shapes. -@cuda.jit(device=True, inline=True) -def get_z_shape_linear(cell_position, index): - iz = int64(math.floor(cell_position)) - if index == 0: - return iz+1.-cell_position - if index == 1: - return cell_position - iz - - -@cuda.jit(device=True, inline=True) -def get_r_shape_linear(cell_position, index): - flip_factor = 1. 
- ir = int64(math.floor(cell_position)) - if index == 0: - if ir < 0: - flip_factor = -1. - return flip_factor*(ir+1.-cell_position) - if index == 1: - return flip_factor*(cell_position - ir) - - -# ------------------------------- -# Field deposition utility - rho -# ------------------------------- - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - int32[:], int32[:])') -def deposit_rho_gpu_linear(x, y, z, w, - invdz, zmin, Nz, - invdr, rmin, Nr, - rho_m0, rho_m1, - cell_idx, prefix_sum): - """ - Deposition of the charge density rho using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of rho that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 4 variables (one for each possible direction, - e.g. upper in z, lower in r) to maintain parallelism while - avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - rho_m0, rho_m1 : 2darrays of complexs - The charge density on the interpolation grid for - mode 0 and 1. 
(is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i / Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare local field arrays - R_m0_00 = 0. - R_m0_01 = 0. - R_m0_10 = 0. - R_m0_11 = 0. - - R_m1_00 = 0. + 0.j - R_m1_01 = 0. + 0.j - R_m1_10 = 0. + 0.j - R_m1_11 = 0. 
+ 0.j - - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0_scal = wj * exptheta_m0 - # Mode 1 - R_m1_scal = wj * exptheta_m1 - - # Boundary Region Shifts - ir_lower = int64(math.floor(r_cell)) - - R_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * R_m1_scal - - if ir_lower == -1: - R_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m1_scal - else: - R_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_10 += 
get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m1_scal - - # Cell shifts for the simulation boundaries - shift_r = 0 - shift_z = 0 - if ir_cell+1 > (Nr-1): - shift_r = -1 - if iz_cell+1 > Nz-1: - shift_z -= Nz - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_00.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_00.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_00.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell), R_m0_01.real) - cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell), R_m1_01.real) - cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell), R_m1_01.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell+1 + shift_r), R_m0_10.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell+1 + shift_r), R_m1_10.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell+1 + shift_r), R_m1_10.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m0_11.real) - cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.real) - cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.imag) - - -# ------------------------------- -# Field deposition utility - J -# ------------------------------- - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:],\ - int32[:], int32[:])') -def deposit_J_gpu_linear(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - j_r_m0, j_r_m1, - j_t_m0, j_t_m1, - j_z_m0, j_z_m1, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. 
- Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 4 variables (one for each possible direction, - e.g. upper in z, lower in r) to maintain parallelism while - avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs - The current component in each direction (r, t, z) - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i/Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). 
- incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare the local field value for - # all possible deposition directions, - # depending on the shape order and per mode for r,t and z. - - J_r_m0_00 = 0. - J_r_m1_00 = 0. + 0.j - J_t_m0_00 = 0.# + 0.j - J_t_m1_00 = 0. + 0.j - J_z_m0_00 = 0. - J_z_m1_00 = 0. + 0.j - - J_r_m0_01 = 0. - J_r_m1_01 = 0. + 0.j - J_t_m0_01 = 0. - J_t_m1_01 = 0. + 0.j - J_z_m0_01 = 0. - J_z_m1_01 = 0. + 0.j - - J_r_m0_10 = 0. - J_r_m1_10 = 0. + 0.j - J_t_m0_10 = 0. - J_t_m1_10 = 0. + 0.j - J_z_m0_10 = 0. - J_z_m1_10 = 0. + 0.j - - J_r_m0_11 = 0. - J_r_m1_11 = 0. + 0.j - J_t_m0_11 = 0. - J_t_m1_11 = 0. + 0.j - J_z_m0_11 = 0. - J_z_m1_11 = 0. + 0.j - - - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. 
- exptheta_m1 = cos + 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 - J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 - J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 - # Mode 1 - J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 - J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 - J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 - - # Deposit on local copies at respective position - ir_lower = int64(math.floor(r_cell)) - - J_r_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - - # Take into account lower r flips - if ir_lower == -1: - J_r_m0_00 += get_r_shape_linear(r_cell, 
1)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - else: - J_r_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) 
* J_t_m1_scal - J_z_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - - # Cell shifts for the simulation boundaries - shift_r = 0 - shift_z = 0 - if (ir_cell+1) > (Nr-1): - shift_r = -1 - if (iz_cell+1) > Nz-1: - shift_z -= Nz - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_00.real) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_00.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_00.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell), J_r_m0_01.real) - cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell+1 + shift_r), J_r_m0_10.real) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m0_11.real) - cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_00.real) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_00.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_00.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell), J_t_m0_01.real) - cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell+1 + shift_r), J_t_m0_10.real) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.imag) - - 
cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m0_11.real) - cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_00.real) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_00.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_00.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell), J_z_m0_01.real) - cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell+1 + shift_r), J_z_m0_10.real) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m0_11.real) - cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.imag) diff --git a/fbpic/particles/cuda_deposition/linear_non_atomic.py b/fbpic/particles/cuda_deposition/linear_non_atomic.py deleted file mode 100644 index 6dea16d21..000000000 --- a/fbpic/particles/cuda_deposition/linear_non_atomic.py +++ /dev/null @@ -1,671 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the deposition methods for rho and J for linear order shapes -without using atomic operations. 
-""" -from numba import cuda -import math -from scipy.constants import c -import numpy as np - -# ------------------------------- -# Field deposition utility - rho -# ------------------------------- - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:,:], complex128[:,:,:], \ - complex128[:,:,:], complex128[:,:,:],\ - int32[:], int32[:])') -def deposit_rho_gpu(x, y, z, w, - invdz, zmin, Nz, - invdr, rmin, Nr, - rho0, rho1, - rho2, rho3, - cell_idx, prefix_sum): - """ - Deposition of the charge density rho using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of rho that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 4 arrays (one for each possible direction, - e.g. upper in z, lower in r) to maintain parallelism while - avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - rho0, rho1, rho2, rho3 : 3darray of complexs - 2d field arrays, one for each of the deposition directions - The third dimension contains the two possible modes. 
- (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz = int(i / Nr) - ir = int(i - iz * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - # Initialize the local field value for - # all four possible deposition directions - # Mode 0, 1 for r, t, z - # 1 : lower in r, lower in z - # 2 : lower in r, upper in z - # 3 : upper in r, lower in z - # 4 : upper in r, upper in z - R1_m0 = 0. + 0.j - R2_m0 = 0. + 0.j - R3_m0 = 0. + 0.j - R4_m0 = 0. + 0.j - # ------------ - R1_m1 = 0. + 0.j - R2_m1 = 0. + 0.j - R3_m1 = 0. + 0.j - R4_m1 = 0. 
+ 0.j - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset - 1 - j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1. / rj - cos = xj * invr # Cosine - sin = yj * invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j * sin - - # Get linear weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr * (rj - rmin) - 0.5 - z_cell = invdz * (zj - zmin) - 0.5 - # Original index of the uppper and lower cell - ir_lower = int(math.floor(r_cell)) - ir_upper = ir_lower + 1 - iz_lower = int(math.floor(z_cell)) - iz_upper = iz_lower + 1 - # Linear weight - Sr_lower = ir_upper - r_cell - Sr_upper = r_cell - ir_lower - Sz_lower = iz_upper - z_cell - Sz_upper = z_cell - iz_lower - # Set guard weights to zero - Sr_guard = 0. - - # Treat the boundary conditions - # -------------------------------------------- - # guard cells in lower r - if ir_lower < 0: - Sr_guard = Sr_lower - Sr_lower = 0. 
- ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr - 1: - ir_lower = Nr - 1 - if ir_upper > Nr - 1: - ir_upper = Nr - 1 - # periodic boundaries in z - # lower z boundaries - if iz_lower < 0: - iz_lower += Nz - if iz_upper < 0: - iz_upper += Nz - # upper z boundaries - if iz_lower > Nz - 1: - iz_lower -= Nz - if iz_upper > Nz - 1: - iz_upper -= Nz - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0 = wj * exptheta_m0 - # Mode 1 - R_m1 = wj * exptheta_m1 - - # Caculate the weighted currents for each - # of the four possible direction - # -------------------------------------------- - if ir_lower == ir_upper: - # In the case that ir_lower and ir_upper are equal, - # the current is added only to the array corresponding - # to ir_lower. - # (This is the case for the boundaries in r) - R1_m0 += Sz_lower * Sr_lower * R_m0 - R1_m0 += Sz_lower * Sr_upper * R_m0 - R3_m0 += Sz_upper * Sr_lower * R_m0 - R3_m0 += Sz_upper * Sr_upper * R_m0 - # ----------------------------- - R1_m1 += Sz_lower * Sr_lower * R_m1 - R1_m1 += Sz_lower * Sr_upper * R_m1 - R3_m1 += Sz_upper * Sr_lower * R_m1 - R3_m1 += Sz_upper * Sr_upper * R_m1 - # ----------------------------- - if ir_lower != ir_upper: - # In the case that ir_lower and ir_upper are different, - # add the current to the four arrays according to - # the direction. - R1_m0 += Sz_lower * Sr_lower * R_m0 - R2_m0 += Sz_lower * Sr_upper * R_m0 - R3_m0 += Sz_upper * Sr_lower * R_m0 - R4_m0 += Sz_upper * Sr_upper * R_m0 - # ----------------------------- - R1_m1 += Sz_lower * Sr_lower * R_m1 - R2_m1 += Sz_lower * Sr_upper * R_m1 - R3_m1 += Sz_upper * Sr_lower * R_m1 - R4_m1 += Sz_upper * Sr_upper * R_m1 - # ----------------------------- - if ir_lower == ir_upper == 0: - # Treat the guard cells. - # Add the current to the guard cells - # for particles that had an original - # cell index < 0. - R1_m0 += -1. * Sz_lower * Sr_guard * R_m0 - R3_m0 += -1. 
* Sz_upper * Sr_guard * R_m0 - # --------------------------------- - R1_m1 += -1. * Sz_lower * Sr_guard * R_m1 - R3_m1 += -1. * Sz_upper * Sr_guard * R_m1 - # Write the calculated field values to - # the field arrays defined on the interpolation grid - rho0[iz, ir, 0] = R1_m0 - rho0[iz, ir, 1] = R1_m1 - rho1[iz, ir, 0] = R2_m0 - rho1[iz, ir, 1] = R2_m1 - rho2[iz, ir, 0] = R3_m0 - rho2[iz, ir, 1] = R3_m1 - rho3[iz, ir, 0] = R4_m0 - rho3[iz, ir, 1] = R4_m1 - - -@cuda.jit('void(complex128[:,:], complex128[:,:], \ - complex128[:,:,:], complex128[:,:,:], \ - complex128[:,:,:], complex128[:,:,:])') -def add_rho(rho_m0, rho_m1, - rho0, rho1, - rho2, rho3): - """ - Merges the 4 separate field arrays that contain rho for - each deposition direction and adds them to the global - interpolation grid arrays for mode 0 and 1. - - Parameters - ---------- - rho_m0, rho_m1 : 2darrays of complexs - The charge density on the interpolation grid for - mode 0 and 1. (is modified by this function) - - rho0, rho1, rho2, rho3 : 3darrays of complexs - 2d field arrays, one for each of the deposition directions - The third dimension contains the two possible modes. 
- """ - # Get the CUDA Grid in 2D - i, j = cuda.grid(2) - # Only for threads within (nz, nr) - if (i < rho_m0.shape[0] and j < rho_m0.shape[1]): - # Sum the four field arrays for the different deposition - # directions and write them to the global field array - rho_m0[i, j] += rho0[i, j, 0] + \ - rho1[i, j - 1, 0] + \ - rho2[i - 1, j, 0] + \ - rho3[i - 1, j - 1, 0] - - rho_m1[i, j] += rho0[i, j, 1] + \ - rho1[i, j - 1, 1] + \ - rho2[i - 1, j, 1] + \ - rho3[i - 1, j - 1, 1] - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:,:], complex128[:,:,:], \ - complex128[:,:,:], complex128[:,:,:],\ - int32[:], int32[:])') -def deposit_J_gpu(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - J0, J1, - J2, J3, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 4 arrays (one for each possible direction, - e.g. upper in z, lower in r) to maintain parallelism while - avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - J0, J1, J2, J3 : 3darray of complexs - 2d field arrays, one for each of the deposition directions - The third dimension contains the two possible modes and the - 3 directions of J in cylindrical coordinates (r, t, z). 
- (is mofidied by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz = int(i / Nr) - ir = int(i - iz * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - # Initialize the local field value for - # all four possible deposition directions - # Mode 0, 1 for r, t, z - # 1 : lower in r, lower in z - # 2 : lower in r, upper in z - # 3 : upper in r, lower in z - # 4 : upper in r, upper in z - Jr1_m0 = 0. + 0.j - Jr2_m0 = 0. + 0.j - Jr3_m0 = 0. + 0.j - Jr4_m0 = 0. + 0.j - # ------------- - Jr1_m1 = 0. + 0.j - Jr2_m1 = 0. + 0.j - Jr3_m1 = 0. + 0.j - Jr4_m1 = 0. + 0.j - # ------------- - Jt1_m0 = 0. + 0.j - Jt2_m0 = 0. + 0.j - Jt3_m0 = 0. + 0.j - Jt4_m0 = 0. + 0.j - # ------------- - Jt1_m1 = 0. + 0.j - Jt2_m1 = 0. + 0.j - Jt3_m1 = 0. + 0.j - Jt4_m1 = 0. + 0.j - # ------------- - Jz1_m0 = 0. + 0.j - Jz2_m0 = 0. + 0.j - Jz3_m0 = 0. + 0.j - Jz4_m0 = 0. + 0.j - # ------------- - Jz1_m1 = 0. + 0.j - Jz2_m1 = 0. 
+ 0.j - Jz3_m1 = 0. + 0.j - Jz4_m1 = 0. + 0.j - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset - 1 - j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1. / rj - cos = xj * invr # Cosine - sin = yj * invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j * sin - - # Get linear weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr * (rj - rmin) - 0.5 - z_cell = invdz * (zj - zmin) - 0.5 - # Original index of the uppper and lower cell - # in r and z - ir_lower = int(math.floor(r_cell)) - ir_upper = ir_lower + 1 - iz_lower = int(math.floor(z_cell)) - iz_upper = iz_lower + 1 - # Linear weight - Sr_lower = ir_upper - r_cell - Sr_upper = r_cell - ir_lower - Sz_lower = iz_upper - z_cell - Sz_upper = z_cell - iz_lower - # Set guard weights to zero - Sr_guard = 0. - - # Treat the boundary conditions - # -------------------------------------------- - # guard cells in lower r - if ir_lower < 0: - Sr_guard = Sr_lower - Sr_lower = 0. 
- ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr - 1: - ir_lower = Nr - 1 - if ir_upper > Nr - 1: - ir_upper = Nr - 1 - # periodic boundaries in z - # lower z boundaries - if iz_lower < 0: - iz_lower += Nz - if iz_upper < 0: - iz_upper += Nz - # upper z boundaries - if iz_lower > Nz - 1: - iz_lower -= Nz - if iz_upper > Nz - 1: - iz_upper -= Nz - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - Jr_m0 = wj * c * inv_gammaj * (cos * uxj + sin * uyj) * exptheta_m0 - Jt_m0 = wj * c * inv_gammaj * (cos * uyj - sin * uxj) * exptheta_m0 - Jz_m0 = wj * c * inv_gammaj * uzj * exptheta_m0 - # Mode 1 - Jr_m1 = wj * c * inv_gammaj * (cos * uxj + sin * uyj) * exptheta_m1 - Jt_m1 = wj * c * inv_gammaj * (cos * uyj - sin * uxj) * exptheta_m1 - Jz_m1 = wj * c * inv_gammaj * uzj * exptheta_m1 - - # Caculate the weighted currents for each - # of the four possible direction - # -------------------------------------------- - if ir_lower == ir_upper: - # In the case that ir_lower and ir_upper are equal, - # the current is added only to the array corresponding - # to ir_lower. 
- # (This is the case for the boundaries in r) - Jr1_m0 += Sz_lower * Sr_lower * Jr_m0 - Jr1_m0 += Sz_lower * Sr_upper * Jr_m0 - Jr3_m0 += Sz_upper * Sr_lower * Jr_m0 - Jr3_m0 += Sz_upper * Sr_upper * Jr_m0 - # ------------------------------- - Jr1_m1 += Sz_lower * Sr_lower * Jr_m1 - Jr1_m1 += Sz_lower * Sr_upper * Jr_m1 - Jr3_m1 += Sz_upper * Sr_lower * Jr_m1 - Jr3_m1 += Sz_upper * Sr_upper * Jr_m1 - # ------------------------------- - Jt1_m0 += Sz_lower * Sr_lower * Jt_m0 - Jt1_m0 += Sz_lower * Sr_upper * Jt_m0 - Jt3_m0 += Sz_upper * Sr_lower * Jt_m0 - Jt3_m0 += Sz_upper * Sr_upper * Jt_m0 - # ------------------------------- - Jt1_m1 += Sz_lower * Sr_lower * Jt_m1 - Jt1_m1 += Sz_lower * Sr_upper * Jt_m1 - Jt3_m1 += Sz_upper * Sr_lower * Jt_m1 - Jt3_m1 += Sz_upper * Sr_upper * Jt_m1 - # ------------------------------- - Jz1_m0 += Sz_lower * Sr_lower * Jz_m0 - Jz1_m0 += Sz_lower * Sr_upper * Jz_m0 - Jz3_m0 += Sz_upper * Sr_lower * Jz_m0 - Jz3_m0 += Sz_upper * Sr_upper * Jz_m0 - # ------------------------------- - Jz1_m1 += Sz_lower * Sr_lower * Jz_m1 - Jz1_m1 += Sz_lower * Sr_upper * Jz_m1 - Jz3_m1 += Sz_upper * Sr_lower * Jz_m1 - Jz3_m1 += Sz_upper * Sr_upper * Jz_m1 - # ------------------------------- - if ir_lower != ir_upper: - # In the case that ir_lower and ir_upper are different, - # add the current to the four arrays according to - # the direction. 
- Jr1_m0 += Sz_lower * Sr_lower * Jr_m0 - Jr2_m0 += Sz_lower * Sr_upper * Jr_m0 - Jr3_m0 += Sz_upper * Sr_lower * Jr_m0 - Jr4_m0 += Sz_upper * Sr_upper * Jr_m0 - # ------------------------------- - Jr1_m1 += Sz_lower * Sr_lower * Jr_m1 - Jr2_m1 += Sz_lower * Sr_upper * Jr_m1 - Jr3_m1 += Sz_upper * Sr_lower * Jr_m1 - Jr4_m1 += Sz_upper * Sr_upper * Jr_m1 - # ------------------------------- - Jt1_m0 += Sz_lower * Sr_lower * Jt_m0 - Jt2_m0 += Sz_lower * Sr_upper * Jt_m0 - Jt3_m0 += Sz_upper * Sr_lower * Jt_m0 - Jt4_m0 += Sz_upper * Sr_upper * Jt_m0 - # ------------------------------- - Jt1_m1 += Sz_lower * Sr_lower * Jt_m1 - Jt2_m1 += Sz_lower * Sr_upper * Jt_m1 - Jt3_m1 += Sz_upper * Sr_lower * Jt_m1 - Jt4_m1 += Sz_upper * Sr_upper * Jt_m1 - # ------------------------------- - Jz1_m0 += Sz_lower * Sr_lower * Jz_m0 - Jz2_m0 += Sz_lower * Sr_upper * Jz_m0 - Jz3_m0 += Sz_upper * Sr_lower * Jz_m0 - Jz4_m0 += Sz_upper * Sr_upper * Jz_m0 - # ------------------------------- - Jz1_m1 += Sz_lower * Sr_lower * Jz_m1 - Jz2_m1 += Sz_lower * Sr_upper * Jz_m1 - Jz3_m1 += Sz_upper * Sr_lower * Jz_m1 - Jz4_m1 += Sz_upper * Sr_upper * Jz_m1 - # ------------------------------- - if ir_lower == ir_upper == 0: - # Treat the guard cells. - # Add the current to the guard cells - # for particles that had an original - # cell index < 0. - Jr1_m0 += -1. * Sz_lower * Sr_guard * Jr_m0 - Jr3_m0 += -1. * Sz_upper * Sr_guard * Jr_m0 - # ----------------------------------- - Jr1_m1 += -1. * Sz_lower * Sr_guard * Jr_m1 - Jr3_m1 += -1. * Sz_upper * Sr_guard * Jr_m1 - # ----------------------------------- - Jt1_m0 += -1. * Sz_lower * Sr_guard * Jt_m0 - Jt3_m0 += -1. * Sz_upper * Sr_guard * Jt_m0 - # ----------------------------------- - Jt1_m1 += -1. * Sz_lower * Sr_guard * Jt_m1 - Jt3_m1 += -1. * Sz_upper * Sr_guard * Jt_m1 - # ----------------------------------- - Jz1_m0 += -1. * Sz_lower * Sr_guard * Jz_m0 - Jz3_m0 += -1. 
* Sz_upper * Sr_guard * Jz_m0 - # ----------------------------------- - Jz1_m1 += -1. * Sz_lower * Sr_guard * Jz_m1 - Jz3_m1 += -1. * Sz_upper * Sr_guard * Jz_m1 - # Write the calculated field values to - # the field arrays defined on the interpolation grid - J0[iz, ir, 0] = Jr1_m0 - J0[iz, ir, 1] = Jr1_m1 - J0[iz, ir, 2] = Jt1_m0 - J0[iz, ir, 3] = Jt1_m1 - J0[iz, ir, 4] = Jz1_m0 - J0[iz, ir, 5] = Jz1_m1 - # -------------------- - J1[iz, ir, 0] = Jr2_m0 - J1[iz, ir, 1] = Jr2_m1 - J1[iz, ir, 2] = Jt2_m0 - J1[iz, ir, 3] = Jt2_m1 - J1[iz, ir, 4] = Jz2_m0 - J1[iz, ir, 5] = Jz2_m1 - # -------------------- - J2[iz, ir, 0] = Jr3_m0 - J2[iz, ir, 1] = Jr3_m1 - J2[iz, ir, 2] = Jt3_m0 - J2[iz, ir, 3] = Jt3_m1 - J2[iz, ir, 4] = Jz3_m0 - J2[iz, ir, 5] = Jz3_m1 - # -------------------- - J3[iz, ir, 0] = Jr4_m0 - J3[iz, ir, 1] = Jr4_m1 - J3[iz, ir, 2] = Jt4_m0 - J3[iz, ir, 3] = Jt4_m1 - J3[iz, ir, 4] = Jz4_m0 - J3[iz, ir, 5] = Jz4_m1 - - -@cuda.jit('void(complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - complex128[:,:,:], complex128[:,:,:], \ - complex128[:,:,:], complex128[:,:,:])') -def add_J(Jr_m0, Jr_m1, - Jt_m0, Jt_m1, - Jz_m0, Jz_m1, - J0, J1, - J2, J3): - """ - Merges the 4 separate field arrays that contain J for - each deposition direction and adds them to the global - interpolation grid arrays for mode 0 and 1. - - Parameters - ---------- - Jr_m0, Jr_m1, Jt_m0, Jt_m1, Jz_m0, Jz_m1,: 2darrays of complexs - The current component in each direction (r, t, z) - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - J0, J1, J2, J3 : 3darrays of complexs - 2d field arrays, one for each of the deposition directions - The third dimension contains the two possible modes and - the 3 different components of J (r, t, z). 
- """ - # Get the CUDA Grid in 2D - i, j = cuda.grid(2) - # Only for threads within (nz, nr) - if (i < Jr_m0.shape[0] and j < Jr_m0.shape[1]): - # Sum the four field arrays for the different deposition - # directions and write them to the global field array - Jr_m0[i, j] += J0[i, j, 0] + \ - J1[i, j - 1, 0] + \ - J2[i - 1, j, 0] + \ - J3[i - 1, j - 1, 0] - - Jr_m1[i, j] += J0[i, j, 1] + \ - J1[i, j - 1, 1] + \ - J2[i - 1, j, 1] + \ - J3[i - 1, j - 1, 1] - - Jt_m0[i, j] += J0[i, j, 2] + \ - J1[i, j - 1, 2] + \ - J2[i - 1, j, 2] + \ - J3[i - 1, j - 1, 2] - - Jt_m1[i, j] += J0[i, j, 3] + \ - J1[i, j - 1, 3] + \ - J2[i - 1, j, 3] + \ - J3[i - 1, j - 1, 3] - - Jz_m0[i, j] += J0[i, j, 4] + \ - J1[i, j - 1, 4] + \ - J2[i - 1, j, 4] + \ - J3[i - 1, j - 1, 4] - - Jz_m1[i, j] += J0[i, j, 5] + \ - J1[i, j - 1, 5] + \ - J2[i - 1, j, 5] + \ - J3[i - 1, j - 1, 5] diff --git a/fbpic/particles/cuda_methods.py b/fbpic/particles/cuda_methods.py deleted file mode 100644 index dedc1f2e7..000000000 --- a/fbpic/particles/cuda_methods.py +++ /dev/null @@ -1,997 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the optimized particles methods that use cuda on a GPU -""" -from numba import cuda, float64, int64 -from accelerate.cuda import sorting -import math -from scipy.constants import c, e -import numpy as np - -# ----------------------- -# Particle pusher utility -# ----------------------- - -@cuda.jit(device=True, inline=True) -def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, - Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): - """ - Push at single macroparticle, using the Vay pusher - """ - # Get the magnetic rotation vector - taux = bconst*Bx - tauy = bconst*By - tauz = bconst*Bz - tau2 = taux**2 + tauy**2 + tauz**2 - - # Get the momenta at the half timestep - uxp = ux_i + econst*Ex \ - + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) - 
uyp = uy_i + econst*Ey \ - + inv_gamma_i*( uz_i*taux - ux_i*tauz ) - uzp = uz_i + econst*Ez \ - + inv_gamma_i*( ux_i*tauy - uy_i*taux ) - sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 - utau = uxp*taux + uyp*tauy + uzp*tauz - - # Get the new 1./gamma - inv_gamma_f = math.sqrt( - 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) - - # Reuse the tau and utau arrays to save memory - tx = inv_gamma_f*taux - ty = inv_gamma_f*tauy - tz = inv_gamma_f*tauz - ut = inv_gamma_f*utau - s = 1./( 1 + tau2*inv_gamma_f**2 ) - - # Get the new u - ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) - uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) - uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) - - return( ux_f, uy_f, uz_f, inv_gamma_f ) - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64, float64, int32, float64)') -def push_p_gpu( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, - q, m, Ntot, dt ) : - """ - Advance the particles' momenta, using cuda on the GPU - - Parameters - ---------- - ux, uy, uz : 1darray of floats - The velocity of the particles - (is modified by this function) - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - Ex, Ey, Ez : 1darray of floats - The electric fields acting on the particles - - Bx, By, Bz : 1darray of floats - The magnetic fields acting on the particles - - q : float - The charge of the particle species - - m : float - The mass of the particle species - - Ntot : int - The total number of particles - - dt : float - The time by which the momenta is advanced - """ - # Set a few constants - econst = q*dt/(m*c) - bconst = 0.5*q*dt/m - - #Cuda 1D grid - ip = cuda.grid(1) - - # Loop over the particles - if ip < Ntot: - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst) - -@cuda.jit('void(float64[:], float64[:], 
float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64, int32, float64, int16[:])') -def push_p_ioniz_gpu( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, - m, Ntot, dt, ionization_level ) : - """ - Advance the particles' momenta, using numba on the GPU - This take into account that the particles are ionizable, and thus - that their charge is determined by `ionization_level` - - Parameters - ---------- - ux, uy, uz : 1darray of floats - The velocity of the particles - (is modified by this function) - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - Ex, Ey, Ez : 1darray of floats - The electric fields acting on the particles - - Bx, By, Bz : 1darray of floats - The magnetic fields acting on the particles - - m : float - The mass of the particle species - - Ntot : int - The total number of particles - - dt : float - The time by which the momenta is advanced - - ionization_level : 1darray of ints - The number of electrons that each ion is missing - (compared to a neutral atom) - """ - #Cuda 1D grid - ip = cuda.grid(1) - - # Loop over the particles - if ip < Ntot: - if ionization_level[ip] != 0: - # Set a few constants - econst = ionization_level[ip] * e * dt/(m*c) - bconst = 0.5 * ionization_level[ip] * e * dt/m - # Use the Vay pusher - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst) - -@cuda.jit('void(float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64)') -def push_x_gpu( x, y, z, ux, uy, uz, inv_gamma, dt ) : - """ - Advance the particles' positions over one half-timestep - - This assumes that the positions (x, y, z) are initially either - one half-timestep *behind* the momenta (ux, uy, uz), or at the - same timestep as the momenta. 
- - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - (is modified by this function) - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - dt : float (seconds) - The time by which the position is advanced - """ - # Half timestep, multiplied by c - chdt = c*0.5*dt - - i = cuda.grid(1) - if i < x.shape[0]: - # Particle push - inv_g = inv_gamma[i] - x[i] += chdt*inv_g*ux[i] - y[i] += chdt*inv_g*uy[i] - z[i] += chdt*inv_g*uz[i] - -# ----------------------- -# Field gathering utility -# ----------------------- - -@cuda.jit('void(float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:])') -def gather_field_gpu_linear(x, y, z, - invdz, zmin, Nz, - invdr, rmin, Nr, - Er_m0, Et_m0, Ez_m0, - Er_m1, Et_m1, Ez_m1, - Br_m0, Bt_m0, Bz_m0, - Br_m1, Bt_m1, Bz_m1, - Ex, Ey, Ez, - Bx, By, Bz): - """ - Gathering of the fields (E and B) using numba on the GPU. - Iterates over the particles, calculates the weighted amount - of fields acting on each particle based on its shape (linear). - Fields are gathered in cylindrical coordinates and then - transformed to cartesian coordinates. - Supports only mode 0 and 1. 
- - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box along the - direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - Er_m0, Et_m0, Ez_m0 : 2darray of complexs - The electric fields on the interpolation grid for the mode 0 - - Er_m1, Et_m1, Ez_m1 : 2darray of complexs - The electric fields on the interpolation grid for the mode 1 - - Br_m0, Bt_m0, Bz_m0 : 2darray of complexs - The magnetic fields on the interpolation grid for the mode 0 - - Br_m1, Bt_m1, Bz_m1 : 2darray of complexs - The magnetic fields on the interpolation grid for the mode 1 - - Ex, Ey, Ez : 1darray of floats - The electric fields acting on the particles - (is modified by this function) - - Bx, By, Bz : 1darray of floats - The magnetic fields acting on the particles - (is modified by this function) - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel - # (for threads < number of particles) - if i < x.shape[0]: - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[i] - yj = y[i] - zj = z[i] - - # Cylindrical conversion - rj = math.sqrt( xj**2 + yj**2 ) - if (rj !=0. ) : - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else : - cos = 1. - sin = 0. - exptheta_m0 = 1. 
- exptheta_m1 = cos - 1.j*sin - - # Get linear weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - # Original index of the uppper and lower cell - ir_lower = int(math.floor( r_cell )) - ir_upper = ir_lower + 1 - iz_lower = int(math.floor( z_cell )) - iz_upper = iz_lower + 1 - # Linear weight - Sr_lower = ir_upper - r_cell - Sr_upper = r_cell - ir_lower - Sz_lower = iz_upper - z_cell - Sz_upper = z_cell - iz_lower - # Set guard weights to zero - Sr_guard = 0. - - # Treat the boundary conditions - # -------------------------------------------- - # guard cells in lower r - if ir_lower < 0: - Sr_guard = Sr_lower - Sr_lower = 0. - ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr-1: - ir_lower = Nr-1 - if ir_upper > Nr-1: - ir_upper = Nr-1 - # periodic boundaries in z - # lower z boundaries - if iz_lower < 0: - iz_lower += Nz - if iz_upper < 0: - iz_upper += Nz - # upper z boundaries - if iz_lower > Nz-1: - iz_lower -= Nz - if iz_upper > Nz-1: - iz_upper -= Nz - - #Precalculate Shapes - S_ll = Sz_lower*Sr_lower - S_lu = Sz_lower*Sr_upper - S_ul = Sz_upper*Sr_lower - S_uu = Sz_upper*Sr_upper - S_lg = Sz_lower*Sr_guard - S_ug = Sz_upper*Sr_guard - - # E-Field - # ---------------------------- - # Define the initial placeholders for the - # gathered field for each coordinate - Fr = 0. - Ft = 0. - Fz = 0. 
- - # Mode 0 - # ---------------------------- - # Create temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 0 - # Lower cell in z, Lower cell in r - Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ] - Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ] - Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ] - # Lower cell in z, Upper cell in r - Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ] - Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ] - Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ] - # Upper cell in z, Lower cell in r - Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ] - Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ] - Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ] - # Upper cell in z, Upper cell in r - Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ] - Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ] - Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ] - # Add the fields from the guard cells - if ir_lower == ir_upper == 0: - # Lower cell in z - Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0] - Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0] - Fz_m += 1. * S_lg * Ez_m0[ iz_lower, 0] - # Upper cell in z - Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0] - Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0] - Fz_m += 1. 
* S_ug * Ez_m0[ iz_upper, 0] - # Add the fields from the mode 0 - Fr += (Fr_m*exptheta_m0).real - Ft += (Ft_m*exptheta_m0).real - Fz += (Fz_m*exptheta_m0).real - - # Mode 1 - # ---------------------------- - # Clear the temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 1 - # Lower cell in z, Lower cell in r - Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ] - Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ] - Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ] - # Lower cell in z, Upper cell in r - Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ] - Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ] - Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ] - # Upper cell in z, Lower cell in r - Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ] - Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ] - Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ] - # Upper cell in z, Upper cell in r - Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ] - Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ] - Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ] - # Add the fields from the guard cells - if ir_lower == ir_upper == 0: - # Lower cell in z - Fr_m += 1. * S_lg * Er_m1[ iz_lower, 0] - Ft_m += 1. * S_lg * Et_m1[ iz_lower, 0] - Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0] - # Upper cell in z - Fr_m += 1. * S_ug * Er_m1[ iz_upper, 0] - Ft_m += 1. * S_ug * Et_m1[ iz_upper, 0] - Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0] - # Add the fields from the mode 1 - Fr += 2*(Fr_m*exptheta_m1).real - Ft += 2*(Ft_m*exptheta_m1).real - Fz += 2*(Fz_m*exptheta_m1).real - - # Convert to Cartesian coordinates - # and write to particle field arrays - Ex[i] = cos*Fr - sin*Ft - Ey[i] = sin*Fr + cos*Ft - Ez[i] = Fz - - # B-Field - # ---------------------------- - # Clear the placeholders for the - # gathered field for each coordinate - Fr = 0. - Ft = 0. - Fz = 0. 
- - # Mode 0 - # ---------------------------- - # Create temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 0 - # Lower cell in z, Lower cell in r - Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ] - Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ] - Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ] - # Lower cell in z, Upper cell in r - Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ] - Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ] - Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ] - # Upper cell in z, Lower cell in r - Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ] - Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ] - Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ] - # Upper cell in z, Upper cell in r - Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ] - Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ] - Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ] - # Add the fields from the guard cells - if ir_lower == ir_upper == 0: - # Lower cell in z - Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0] - Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0] - Fz_m += 1. * S_lg * Bz_m0[ iz_lower, 0] - # Upper cell in z - Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0] - Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0] - Fz_m += 1. 
* S_ug * Bz_m0[ iz_upper, 0] - # Add the fields from the mode 0 - Fr += (Fr_m*exptheta_m0).real - Ft += (Ft_m*exptheta_m0).real - Fz += (Fz_m*exptheta_m0).real - - # Mode 1 - # ---------------------------- - # Clear the temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 1 - # Lower cell in z, Lower cell in r - Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ] - Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ] - Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ] - # Lower cell in z, Upper cell in r - Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ] - Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ] - Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ] - # Upper cell in z, Lower cell in r - Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ] - Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ] - Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ] - # Upper cell in z, Upper cell in r - Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ] - Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ] - Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ] - - # Add the fields from the guard cells - if ir_lower == ir_upper == 0: - # Lower cell in z - Fr_m += 1. * S_lg * Br_m1[ iz_lower, 0] - Ft_m += 1. * S_lg * Bt_m1[ iz_lower, 0] - Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0] - # Upper cell in z - Fr_m += 1. * S_ug * Br_m1[ iz_upper, 0] - Ft_m += 1. * S_ug * Bt_m1[ iz_upper, 0] - Fz_m += -1. 
* S_ug * Bz_m1[ iz_upper, 0] - # Add the fields from the mode 1 - Fr += 2*(Fr_m*exptheta_m1).real - Ft += 2*(Ft_m*exptheta_m1).real - Fz += 2*(Fz_m*exptheta_m1).real - - # Convert to Cartesian coordinates - # and write to particle field arrays - Bx[i] = cos*Fr - sin*Ft - By[i] = sin*Fr + cos*Ft - Bz[i] = Fz - - -@cuda.jit('void(float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:])') -def gather_field_gpu_cubic(x, y, z, - invdz, zmin, Nz, - invdr, rmin, Nr, - Er_m0, Et_m0, Ez_m0, - Er_m1, Et_m1, Ez_m1, - Br_m0, Bt_m0, Bz_m0, - Br_m1, Bt_m1, Bz_m1, - Ex, Ey, Ez, - Bx, By, Bz): - """ - Gathering of the fields (E and B) using numba on the GPU. - Iterates over the particles, calculates the weighted amount - of fields acting on each particle based on its shape (cubic). - Fields are gathered in cylindrical coordinates and then - transformed to cartesian coordinates. - Supports only mode 0 and 1. 
- - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box along the - direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - Er_m0, Et_m0, Ez_m0 : 2darray of complexs - The electric fields on the interpolation grid for the mode 0 - - Er_m1, Et_m1, Ez_m1 : 2darray of complexs - The electric fields on the interpolation grid for the mode 1 - - Br_m0, Bt_m0, Bz_m0 : 2darray of complexs - The magnetic fields on the interpolation grid for the mode 0 - - Br_m1, Bt_m1, Bz_m1 : 2darray of complexs - The magnetic fields on the interpolation grid for the mode 1 - - Ex, Ey, Ez : 1darray of floats - The electric fields acting on the particles - (is modified by this function) - - Bx, By, Bz : 1darray of floats - The magnetic fields acting on the particles - (is modified by this function) - """ - - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel - # (for threads < number of particles) - if i < x.shape[0]: - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[i] - yj = y[i] - zj = z[i] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. 
- exptheta_m1 = cos - 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particle, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the shape factors - Sr = cuda.local.array((4,), dtype=float64) - ir = cuda.local.array((4,), dtype=int64) - ir[0] = int64(math.floor(r_cell)) - 1 - ir[1] = ir[0] + 1 - ir[2] = ir[1] + 1 - ir[3] = ir[2] + 1 - Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3 - Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4) - Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4) - Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3 - iz = cuda.local.array((4,), dtype=int64) - Sz = cuda.local.array((4,), dtype=float64) - iz[0] = int64(math.floor(z_cell)) - 1 - iz[1] = iz[0] + 1 - iz[2] = iz[1] + 1 - iz[3] = iz[2] + 1 - Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3 - Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4) - Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4) - Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3 - # Lower and upper periodic boundary for z - for index_z in range(4): - if iz[index_z] < 0: - iz[index_z] += Nz - if iz[index_z] > Nz - 1: - iz[index_z] -= Nz - # Lower and upper boundary for r - for index_r in range(4): - if ir[index_r] < 0: - ir[index_r] = abs(ir[index_r])-1 - Sr[index_r] = (-1.)*Sr[index_r] - if ir[index_r] > Nr - 1: - ir[index_r] = Nr - 1 - - # E-Field - # ---------------------------- - # Define the initial placeholders for the - # gathered field for each coordinate - Fr = 0. - Ft = 0. - Fz = 0. 
- - # Mode 0 - # ---------------------------- - # Create temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 0 - for index_r in range(4): - for index_z in range(4): - Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]] - Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]] - if Sz[index_z]*Sr[index_r] < 0: - Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Ez_m0[iz[index_z], ir[index_r]] - else: - Fz_m += Sz[index_z]*Sr[index_r]* \ - Ez_m0[iz[index_z], ir[index_r]] - - Fr += (Fr_m*exptheta_m0).real - Ft += (Ft_m*exptheta_m0).real - Fz += (Fz_m*exptheta_m0).real - - # Mode 1 - # ---------------------------- - # Clear the temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 1 - for index_r in range(4): - for index_z in range(4): - if Sz[index_z]*Sr[index_r] < 0: - Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Er_m1[iz[index_z], ir[index_r]] - Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Et_m1[iz[index_z], ir[index_r]] - else: - Fr_m += Sz[index_z]*Sr[index_r]* \ - Er_m1[iz[index_z], ir[index_r]] - Ft_m += Sz[index_z]*Sr[index_r]* \ - Et_m1[iz[index_z], ir[index_r]] - Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]] - - # Add the fields from the mode 1 - Fr += 2*(Fr_m*exptheta_m1).real - Ft += 2*(Ft_m*exptheta_m1).real - Fz += 2*(Fz_m*exptheta_m1).real - - # Convert to Cartesian coordinates - # and write to particle field arrays - Ex[i] = (cos*Fr - sin*Ft) - Ey[i] = (sin*Fr + cos*Ft) - Ez[i] = Fz - - # B-Field - # ---------------------------- - # Clear the placeholders for the - # gathered field for each coordinate - Fr = 0. - Ft = 0. - Fz = 0. 
- - # Mode 0 - # ---------------------------- - # Create temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 0 - for index_r in range(4): - for index_z in range(4): - Fr_m += Sz[index_z]*Sr[index_r]* \ - Br_m0[iz[index_z], ir[index_r]] - Ft_m += Sz[index_z]*Sr[index_r]* \ - Bt_m0[iz[index_z], ir[index_r]] - if Sz[index_z]*Sr[index_r] < 0: - Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Bz_m0[iz[index_z], ir[index_r]] - else: - Fz_m += Sz[index_z]*Sr[index_r]* \ - Bz_m0[iz[index_z], ir[index_r]] - - # Add the fields from the mode 0 - Fr += (Fr_m*exptheta_m0).real - Ft += (Ft_m*exptheta_m0).real - Fz += (Fz_m*exptheta_m0).real - - # Mode 1 - # ---------------------------- - # Clear the temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - - # Add the fields for mode 1 - for index_r in range(4): - for index_z in range(4): - if Sz[index_z]*Sr[index_r] < 0: - Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Br_m1[iz[index_z], ir[index_r]] - Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Bt_m1[iz[index_z], ir[index_r]] - else: - Fr_m += Sz[index_z]*Sr[index_r]* \ - Br_m1[iz[index_z], ir[index_r]] - Ft_m += Sz[index_z]*Sr[index_r]* \ - Bt_m1[iz[index_z], ir[index_r]] - Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]] - - # Add the fields from the mode 1 - Fr += 2*(Fr_m*exptheta_m1).real - Ft += 2*(Ft_m*exptheta_m1).real - Fz += 2*(Fz_m*exptheta_m1).real - - # Convert to Cartesian coordinates - # and write to particle field arrays - Bx[i] = cos*Fr - sin*Ft - By[i] = sin*Fr + cos*Ft - Bz[i] = Fz - -# ----------------------------------------------------- -# Sorting utilities - get_cell_idx / sort / prefix_sum -# ----------------------------------------------------- - -@cuda.jit('void(int32[:], uint32[:], \ - float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32)') -def get_cell_idx_per_particle(cell_idx, sorted_idx, - x, y, z, - 
invdz, zmin, Nz, - invdr, rmin, Nr): - """ - Get the cell index of each particle. - The cell index is 1d and calculated by: - cell index in z + cell index in r * number of cells in z. - The cell_idx of a particle is defined by - the lower cell in r and z, that it deposits its field to. - - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - sorted_idx : 1darray of integers - The sorted index array needs to be reset - before doing the sort - - x, y, z : 1darray of floats (in meters) - The position of the particles - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, in each direction - - Nz, Nr : int - Number of gridpoints along the considered direction - """ - i = cuda.grid(1) - if i < cell_idx.shape[0]: - # Preliminary arrays for the cylindrical conversion - xj = x[i] - yj = y[i] - zj = z[i] - rj = math.sqrt( xj**2 + yj**2 ) - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Original index of the uppper and lower cell - ir_lower = int(math.floor( r_cell )) - iz_lower = int(math.floor( z_cell )) - - # Treat the boundary conditions - # guard cells in lower r - if ir_lower < 0: - ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr-1: - ir_lower = Nr-1 - # periodic boundaries in z - if iz_lower < 0: - iz_lower += Nz - if iz_lower > Nz-1: - iz_lower -= Nz - - # Reset sorted_idx array - sorted_idx[i] = i - # Calculate the 1D cell_idx by cell_idx_ir + cell_idx_iz * Nr - cell_idx[i] = ir_lower + iz_lower * Nr - -def sort_particles_per_cell(cell_idx, sorted_idx): - """ - Sort the cell index of the particles and - modify the sorted index array accordingly. 
- - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - sorted_idx : 1darray of integers - Represents the original index of the - particle before the sorting. - """ - Ntot = cell_idx.shape[0] - if Ntot > 0: - sorter = sorting.RadixSort(Ntot, dtype = np.int32) - sorter.sort(cell_idx, vals = sorted_idx) - -@cuda.jit('void(int32[:], int32[:])') -def incl_prefix_sum(cell_idx, prefix_sum): - """ - Perform an inclusive parallel prefix sum on the sorted - cell index array. The prefix sum array represents the - cumulative sum of the number of particles per cell - for each cell index. - - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # i is the index of the macroparticle - i = cuda.grid(1) - if i < cell_idx.shape[0]-1: - # ci: index of the cell of the present macroparticle - ci = cell_idx[i] - # ci_next: index of the cell of the next macroparticle - ci_next = cell_idx[i+1] - # Fill all the cells between ci and ci_next with the - # inclusive cumulative sum of the number particles until ci - while ci < ci_next: - # The cumulative sum of the number of particle per cell - # until ci is i+1 (since i obeys python index, starting at 0) - prefix_sum[ci] = i+1 - ci += 1 - # The last "macroparticle" of the cell_idx array fills up the - # rest of the prefix sum array - if i == cell_idx.shape[0]-1: - # Get the cell_index of the last macroparticle - ci = cell_idx[i] - # Fill all the remaining entries of the prefix sum array - for empty_index in range(ci, prefix_sum.shape[0]): - prefix_sum[empty_index] = i+1 - -@cuda.jit('void(int32[:])') -def reset_prefix_sum(prefix_sum): - """ - Resets the prefix sum. Sets all the values - to zero. 
- - Parameters - ---------- - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - i = cuda.grid(1) - if i < prefix_sum.shape[0]: - prefix_sum[i] = 0 - -@cuda.jit('void(uint32[:], float64[:], float64[:])') -def write_sorting_buffer(sorted_idx, val, buf): - """ - Writes the values of a particle array to a buffer, - while rearranging them to match the sorted cell index array. - - Parameters - ---------- - sorted_idx : 1darray of integers - Represents the original index of the - particle before the sorting - - val : 1d array of floats - A particle data array - - buf : 1d array of floats - A buffer array to temporarily store the - sorted particle data array - """ - i = cuda.grid(1) - if i < val.shape[0]: - buf[i] = val[sorted_idx[i]] - -# ----------------------------------------------------- -# Device array creation utility (will be removed later) -# ----------------------------------------------------- - -def cuda_deposition_arrays(Nz = None, Nr = None, fieldtype = None): - """ - Create empty arrays on the GPU for the charge and - current deposition in each of the 4 possible direction. - - ########################################### - # Needs to be moved to the fields package! - ########################################### - - Parameters - ---------- - Nz : int - Number of cells in z. - Nr : int - Number of cells in r. - - fieldtype : string - Either 'rho' or 'J'. - """ - # Create empty arrays to store the four different possible - # cell directions a particle can deposit to. 
- if fieldtype == 'rho': - # Rho - third dimension represents 2 modes - rho0 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho1 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho2 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho3 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - return rho0, rho1, rho2, rho3 - - if fieldtype == 'J': - # J - third dimension represents 2 modes - # times 3 dimensions (r, t, z) - J0 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J1 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J2 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J3 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - return J0, J1, J2, J3 diff --git a/fbpic/particles/numba_methods.py b/fbpic/particles/numba_methods.py deleted file mode 100644 index 17d94a84a..000000000 --- a/fbpic/particles/numba_methods.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the optimized particles methods that use numba on a CPU -""" -import numba -import math -from scipy.constants import c, e - -# ----------------------- -# Particle pusher utility -# ----------------------- - -@numba.jit(nopython=True) -def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): - """ - Advance the particles' positions over one half-timestep - - This assumes that the positions (x, y, z) are initially either - one half-timestep *behind* the momenta (ux, uy, uz), or at the - same timestep as the momenta. 
- """ - # Half timestep, multiplied by c - chdt = c*0.5*dt - - # Particle push - for ip in range(Ntot) : - x[ip] += chdt * inv_gamma[ip] * ux[ip] - y[ip] += chdt * inv_gamma[ip] * uy[ip] - z[ip] += chdt * inv_gamma[ip] * uz[ip] - -@numba.jit(nopython=True) -def push_p_numba( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) : - """ - Advance the particles' momenta, using numba - """ - # Set a few constants - econst = q*dt/(m*c) - bconst = 0.5*q*dt/m - - # Loop over the particles - for ip in range(Ntot) : - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) - -@numba.jit(nopython=True) -def push_p_ioniz_numba( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) : - """ - Advance the particles' momenta, using numba - """ - # Set a few constants - prefactor_econst = e*dt/(m*c) - prefactor_bconst = 0.5*e*dt/m - - # Loop over the particles - for ip in range(Ntot) : - - # For neutral macroparticles, skip this step - if ionization_level[ip] == 0: - continue - - # Calculate the charge dependent constants - econst = prefactor_econst * ionization_level[ip] - bconst = prefactor_bconst * ionization_level[ip] - # Perform the push - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], - econst, bconst ) - -@numba.jit(nopython=True) -def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, - Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): - """ - Push at single macroparticle, using the Vay pusher - """ - # Get the magnetic rotation vector - taux = bconst*Bx - tauy = bconst*By - tauz = bconst*Bz - tau2 = taux**2 + tauy**2 + tauz**2 - - # Get the momenta at the half timestep - uxp = ux_i + econst*Ex \ - + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) - uyp = uy_i + econst*Ey \ - + inv_gamma_i*( uz_i*taux - ux_i*tauz ) - uzp = uz_i + econst*Ez \ - + inv_gamma_i*( 
ux_i*tauy - uy_i*taux ) - sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 - utau = uxp*taux + uyp*tauy + uzp*tauz - - # Get the new 1./gamma - inv_gamma_f = math.sqrt( - 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) - - # Reuse the tau and utau variables to save memory - tx = inv_gamma_f*taux - ty = inv_gamma_f*tauy - tz = inv_gamma_f*tauz - ut = inv_gamma_f*utau - s = 1./( 1 + tau2*inv_gamma_f**2 ) - - # Get the new u - ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) - uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) - uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) - - return( ux_f, uy_f, uz_f, inv_gamma_f ) - -# ----------------------- -# Field gathering utility -# ----------------------- - -@numba.jit(nopython=True) -def gather_field_numba(exptheta, m, Fgrid, Fptcl, - iz, ir, Sz, Sr, sign_guards): - """ - Perform the weighted sum using numba - - Parameters - ---------- - exptheta : 1darray of complexs - (one element per macroparticle) - Contains exp(-im theta) for each macroparticle - - m : int - Index of the mode. - Determines wether a factor 2 should be applied - - Fgrid : 2darray of complexs - Contains the fields on the interpolation grid, - from which to do the gathering - - Fptcl : 1darray of floats - (one element per macroparticle) - Contains the fields for each macroparticle - Is modified by this function - - iz, ir : 2darray of ints - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the index of the cells that each macroparticle - will gather from. - - Sz, Sr: 2darray of floats - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the weight for respective cells from iz and ir, - for each macroparticle. - - sign_guards : float - The sign (+1 or -1) with which the weight of the guard cells should - be added to the 0th cell. 
- """ - # Get the total number of particles - Ntot = len(Fptcl) - - # Loop over the particles - for ip in range(Ntot): - # Erase the temporary variable - F = 0.j - # Loop over all the adjacent cells (given by shape order) - # Use helper variables `ir_corr` and `Sr_corr`. - # This is necessary, because ir and Sr should **not** be modified - # **in-place**. (This is because ir and Sr are reused several - # times, as we call the present function 3 times, with different - # values for sign_guards.) - for cell_index_r in range(ir.shape[0]): - for cell_index_z in range(iz.shape[0]): - # Correct the guard cell index and sign - if ir[cell_index_r, ip] < 0: - ir_corr = abs(ir[cell_index_r, ip]) - 1 - Sr_corr = sign_guards * Sr[cell_index_r, ip] - else: - ir_corr = ir[cell_index_r, ip] - Sr_corr = Sr[cell_index_r, ip] - # Gather the field value at the respective grid point - F += Sz[cell_index_z, ip] * Sr_corr * \ - Fgrid[ iz[cell_index_z, ip], ir_corr] - - # Add the complex phase - if m == 0: - Fptcl[ip] += (F * exptheta[ip]).real - if m > 0: - Fptcl[ip] += 2 * (F * exptheta[ip]).real - -# ------------------------- -# Charge deposition utility -# ------------------------- - -@numba.jit(nopython=True) -def deposit_field_numba(Fptcl, Fgrid, - iz, ir, Sz, Sr, sign_guards): - """ - Perform the deposition using numba - - Parameters - ---------- - Fptcl : 1darray of complexs - (one element per macroparticle) - Contains the charge or current for each macroparticle (already - multiplied by exp(im theta), from which to do the deposition - - Fgrid : 2darray of complexs - Contains the fields on the interpolation grid. - Is modified by this function - - iz, ir : 2darray of ints - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the index of the cells that each macroparticle - will deposit to. 
- - Sz, Sr: 2darray of floats - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the weight for respective cells from iz and ir, - for each macroparticle. - - sign_guards : float - The sign (+1 or -1) with which the weight of the guard cells should - be added to the 0th cell. - """ - - # Get the total number of particles - Ntot = len(Fptcl) - - # Loop over all particles - for ip in range(Ntot): - # Loop over adjacent cells (given by shape order) - # Use helper variables `ir_corr` and `Sr_corr`, in order to avoid - # modifying ir and Sr in place. (This is not strictly necessary, - # but is just here as a safeguard.) - for cell_index_r in range(ir.shape[0]): - for cell_index_z in range(iz.shape[0]): - # Correct the guard cell index and sign - if ir[cell_index_r, ip] < 0: - ir_corr = abs(ir[cell_index_r, ip]) - 1 - Sr_corr = sign_guards * Sr[cell_index_r, ip] - else: - ir_corr = ir[cell_index_r, ip] - Sr_corr = Sr[cell_index_r, ip] - # Deposit field from particle to the respective grid point - Fgrid[ iz[cell_index_z, ip], ir_corr ] += \ - Sz[cell_index_z,ip] * Sr_corr * Fptcl[ip] diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index f9a15a945..0094e305e 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -9,28 +9,36 @@ from scipy.constants import c, e from .ionization import Ionizer from .tracking import ParticleTracker +import numba +import math # Load the utility methods -from .utility_methods import weights, unalign_angles -# Load the numba routines -from .numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba, \ - gather_field_numba, deposit_field_numba +from .utilities.utility_methods import weights, unalign_angles +# Load the numba methods +from .push.numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba +from .deposition.numba_methods import deposit_field_numba +from .gathering.numba_methods import gather_field_numba +# Load the numba CPU 
multi-threading methods +from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \ + push_x_prange +from .deposition.threading_methods import deposit_rho_prange_linear, \ + deposit_J_prange_linear #CUBIC tbd +from .gathering.threading_methods import gather_field_prange_linear, \ + gather_field_prange_cubic # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: + # Load the CUDA methods from fbpic.cuda_utils import cuda, cuda_tpb_bpg_1d, cuda_tpb_bpg_2d - from .cuda_methods import push_p_gpu, push_p_ioniz_gpu, push_x_gpu, \ - gather_field_gpu_linear, gather_field_gpu_cubic, \ - write_sorting_buffer, cuda_deposition_arrays, \ + from .push.cuda_methods import push_p_gpu, push_p_ioniz_gpu, push_x_gpu + from .deposition.cuda_methods import deposit_rho_gpu_linear, \ + deposit_J_gpu_linear, deposit_rho_gpu_cubic, deposit_J_gpu_cubic + from .gathering.cuda_methods import gather_field_gpu_linear, \ + gather_field_gpu_cubic + from .utilities.cuda_sorting import write_sorting_buffer, \ get_cell_idx_per_particle, sort_particles_per_cell, \ reset_prefix_sum, incl_prefix_sum - from .cuda_deposition.cubic import deposit_rho_gpu_cubic, \ - deposit_J_gpu_cubic - from .cuda_deposition.linear import deposit_rho_gpu_linear, \ - deposit_J_gpu_linear - from .cuda_deposition.linear_non_atomic import deposit_rho_gpu, \ - deposit_J_gpu, add_rho, add_J class Particles(object) : """ @@ -50,7 +58,8 @@ def __init__(self, q, m, n, Npz, zmin, zmax, ux_m=0., uy_m=0., uz_m=0., ux_th=0., uy_th=0., uz_th=0., dens_func=None, continuous_injection=True, - use_cuda=False, grid_shape=None, particle_shape='linear' ) : + grid_shape=None, particle_shape='linear', + use_cuda=False, use_threading=True) : """ Initialize a uniform set of particles @@ -100,9 +109,6 @@ def dens_func( z, r ) ... Whether to continuously inject the particles, in the case of a moving window - use_cuda : bool, optional - Wether to use the GPU or not. 
- grid_shape: tuple, optional Needed when running on the GPU The shape of the local grid (including guard cells), i.e. @@ -111,10 +117,14 @@ def dens_func( z, r ) ... particle_shape: str, optional Set the particle shape for the charge/current deposition. - Possible values are 'cubic', 'linear' and 'linear_non_atomic'. - While 'cubic' corresponds to third order shapes and 'linear' - to first order shapes, 'linear_non_atomic' uses an equivalent - deposition scheme to 'linear' which avoids atomics on the GPU. + Possible values are 'linear' and 'cubic' for first and third + order particle shape factors. + + use_cuda : bool, optional + Wether to use the GPU or not. + + use_threading : bool, optional + Wether to use multi-threading on the CPU. """ # Register the timestep self.dt = dt @@ -198,6 +208,9 @@ def dens_func( z, r ) ... if dens_func is not None : self.w[:] = self.w * dens_func( self.z, r ) + # Register particle shape + self.particle_shape = particle_shape + # Allocate arrays and register variables when using CUDA if self.use_cuda: if grid_shape is None: @@ -213,9 +226,11 @@ def dens_func( z, r ) ... dtype=np.int32 ) # Register boolean that records if the particles are sorted or not self.sorted = False - - # Register particle shape - self.particle_shape = particle_shape + # Register variables when using multithreading + self.use_threading = use_threading + if self.use_threading == True: + # Register number of threads + self.nthreads = numba.config.NUMBA_NUM_THREADS def send_particles_to_gpu( self ): """ @@ -423,6 +438,7 @@ def push_p( self ) : half-timestep *behind* the positions (x, y, z), and it brings them one half-timestep *ahead* of the positions. 
""" + # GPU (CUDA) version if self.use_cuda: # Get the threads per block and the blocks per grid dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot ) @@ -441,7 +457,20 @@ def push_p( self ) : self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz, self.m, self.Ntot, self.dt, self.ionizer.ionization_level ) - else : + # CPU multi-threading version + elif self.use_threading: + if self.ionizer is None: + push_p_prange(self.ux, self.uy, self.uz, self.inv_gamma, + self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz, + self.q, self.m, self.Ntot, self.dt ) + else: + # Ionizable species can have a charge that depends on the + # macroparticle, and hence require a different function + push_p_ioniz_prange(self.ux, self.uy, self.uz, self.inv_gamma, + self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz, + self.m, self.Ntot, self.dt, self.ionizer.ionization_level ) + # CPU single-core version + else: if self.ionizer is None: push_p_numba(self.ux, self.uy, self.uz, self.inv_gamma, self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz, @@ -461,6 +490,7 @@ def halfpush_x( self ) : one half-timestep *behind* the momenta (ux, uy, uz), or at the same timestep as the momenta. 
""" + # GPU (CUDA) version if self.use_cuda: # Get the threads per block and the blocks per grid dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot ) @@ -471,7 +501,13 @@ def halfpush_x( self ) : self.inv_gamma, self.dt ) # The particle array is unsorted after the push in x self.sorted = False - else : + # CPU multi-threading version + elif self.use_threading: + push_x_prange( self.x, self.y, self.z, + self.ux, self.uy, self.uz, + self.inv_gamma, self.Ntot, self.dt ) + # CPU single-core version + else: push_x_numba( self.x, self.y, self.z, self.ux, self.uy, self.uz, self.inv_gamma, self.Ntot, self.dt ) @@ -489,12 +525,24 @@ def gather( self, grid ) : (one InterpolationGrid object per azimuthal mode) Contains the field values on the interpolation grid """ - if self.use_cuda == True: + # GPU (CUDA) version + if self.use_cuda: # Get the threads per block and the blocks per grid dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot ) # Call the CUDA Kernel for the gathering of E and B Fields # for Mode 0 and 1 only. 
- if self.particle_shape == 'cubic': + if self.particle_shape == 'linear': + gather_field_gpu_linear[dim_grid_1d, dim_block_1d]( + self.x, self.y, self.z, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + grid[0].Er, grid[0].Et, grid[0].Ez, + grid[1].Er, grid[1].Et, grid[1].Ez, + grid[0].Br, grid[0].Bt, grid[0].Bz, + grid[1].Br, grid[1].Bt, grid[1].Bz, + self.Ex, self.Ey, self.Ez, + self.Bx, self.By, self.Bz) + elif self.particle_shape == 'cubic': gather_field_gpu_cubic[dim_grid_1d, dim_block_1d]( self.x, self.y, self.z, grid[0].invdz, grid[0].zmin, grid[0].Nz, @@ -506,7 +554,24 @@ def gather( self, grid ) : self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz) else: - gather_field_gpu_linear[dim_grid_1d, dim_block_1d]( + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # CPU multi-threading version + elif self.use_threading: + if self.particle_shape == 'linear': + gather_field_prange_linear( + self.x, self.y, self.z, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + grid[0].Er, grid[0].Et, grid[0].Ez, + grid[1].Er, grid[1].Et, grid[1].Ez, + grid[0].Br, grid[0].Bt, grid[0].Bz, + grid[1].Br, grid[1].Bt, grid[1].Bz, + self.Ex, self.Ey, self.Ez, + self.Bx, self.By, self.Bz) + elif self.particle_shape == 'cubic': + gather_field_prange_cubic( self.x, self.y, self.z, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, @@ -516,6 +581,11 @@ def gather( self, grid ) : grid[1].Br, grid[1].Bt, grid[1].Bz, self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz) + else: + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # CPU single-core version else: # Preliminary arrays for the cylindrical conversion r = np.sqrt( self.x**2 + self.y**2 ) @@ -624,19 +694,13 @@ def deposit( self, fld, fieldtype ) : """ # Shortcut for the list of 
InterpolationGrid objects grid = fld.interp - - if self.use_cuda == True: + # GPU (CUDA) version + if self.use_cuda: # Get the threads per block and the blocks per grid dim_grid_2d_flat, dim_block_2d_flat = cuda_tpb_bpg_1d( grid[0].Nz*grid[0].Nr ) dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d( grid[0].Nz, grid[0].Nr ) - - # Create the helper arrays for deposition - if self.particle_shape == 'linear_non_atomic': - d_F0, d_F1, d_F2, d_F3 = cuda_deposition_arrays( - grid[0].Nz, grid[0].Nr, fieldtype=fieldtype) - # Sort the particles if self.sorted is False: self.sort_particles(fld=fld) @@ -648,17 +712,13 @@ def deposit( self, fld, fieldtype ) : # Rho if fieldtype == 'rho': # Deposit rho in each of four directions - if self.particle_shape == 'linear_non_atomic': - deposit_rho_gpu[dim_grid_2d_flat, dim_block_2d_flat]( + if self.particle_shape == 'linear': + deposit_rho_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat]( self.x, self.y, self.z, self.w, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, - d_F0, d_F1, d_F2, d_F3, - self.cell_idx, self.prefix_sum) - # Add the four directions together - add_rho[dim_grid_2d, dim_block_2d]( grid[0].rho, grid[1].rho, - d_F0, d_F1, d_F2, d_F3) + self.cell_idx, self.prefix_sum) elif self.particle_shape == 'cubic': deposit_rho_gpu_cubic[dim_grid_2d_flat, dim_block_2d_flat]( self.x, self.y, self.z, self.w, @@ -666,33 +726,23 @@ def deposit( self, fld, fieldtype ) : grid[0].invdr, grid[0].rmin, grid[0].Nr, grid[0].rho, grid[1].rho, self.cell_idx, self.prefix_sum) - elif self.particle_shape == 'linear': - deposit_rho_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat]( - self.x, self.y, self.z, self.w, - grid[0].invdz, grid[0].zmin, grid[0].Nz, - grid[0].invdr, grid[0].rmin, grid[0].Nr, - grid[0].rho, grid[1].rho, - self.cell_idx, self.prefix_sum) else: - raise ValueError("`particle_shape` should be either 'linear', 'linear_atomic' \ - or 'cubic' but is `%s`" % self.particle_shape) + raise 
ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) # J elif fieldtype == 'J': # Deposit J in each of four directions - if self.particle_shape == 'linear_non_atomic': - deposit_J_gpu[dim_grid_2d_flat, dim_block_2d_flat]( + if self.particle_shape == 'linear': + deposit_J_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat]( self.x, self.y, self.z, self.w, self.ux, self.uy, self.uz, self.inv_gamma, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, - d_F0, d_F1, d_F2, d_F3, - self.cell_idx, self.prefix_sum) - # Add the four directions together - add_J[dim_grid_2d, dim_block_2d]( grid[0].Jr, grid[1].Jr, grid[0].Jt, grid[1].Jt, grid[0].Jz, grid[1].Jz, - d_F0, d_F1, d_F2, d_F3) + self.cell_idx, self.prefix_sum) elif self.particle_shape == 'cubic': deposit_J_gpu_cubic[dim_grid_2d_flat, dim_block_2d_flat]( self.x, self.y, self.z, self.w, @@ -703,26 +753,117 @@ def deposit( self, fld, fieldtype ) : grid[0].Jt, grid[1].Jt, grid[0].Jz, grid[1].Jz, self.cell_idx, self.prefix_sum) - elif self.particle_shape == 'linear': - deposit_J_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat]( + else: + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + else: + raise ValueError("`fieldtype` should be either 'J' or \ + 'rho', but is `%s`" % fieldtype) + # CPU multi-threading version + elif self.use_threading: + # Register particle chunk size for each thread + tx_N = int(self.Ntot/self.nthreads) + tx_chunks = [ tx_N for k in range(self.nthreads) ] + tx_chunks[-1] = tx_chunks[-1] + (tx_N)%(self.nthreads) + # Multithreading functions for the deposition of rho or J + # for Mode 0 and 1 only. 
+ if fieldtype == 'rho': + # Generate temporary arrays for rho + rho_m0_global = np.zeros( + (grid[0].rho.shape[0], grid[0].rho.shape[1], self.nthreads), + dtype=grid[0].rho.dtype ) + rho_m1_global = np.zeros( + (grid[1].rho.shape[0], grid[1].rho.shape[1], self.nthreads), + dtype=grid[1].rho.dtype ) + # Deposit rho using CPU threading + if self.particle_shape == 'linear': + deposit_rho_prange_linear( + self.x, self.y, self.z, self.w, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + rho_m0_global, rho_m1_global, + grid[0].rho, grid[1].rho, + self.nthreads, tx_chunks, tx_N ) + elif self.particle_shape == 'cubic': + deposit_rho_prange_cubic( + self.x, self.y, self.z, self.w, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + rho_m0_global, rho_m1_global, + grid[0].rho, grid[1].rho, + self.nthreads, tx_chunks, tx_N ) + else: + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # Sum thread-local results to main field array + grid[0].rho = np.sum(rho_m0_global, axis=2) + grid[1].rho = np.sum(rho_m1_global, axis=2) + + elif fieldtype == 'J': + # Generate temporary arrays for J + Jr_m0_global = np.zeros( + (grid[0].Jr.shape[0], grid[0].Jr.shape[1], self.nthreads), + dtype=grid[0].Jr.dtype ) + Jt_m0_global = np.zeros( + (grid[0].Jt.shape[0], grid[0].Jt.shape[1], self.nthreads), + dtype=grid[0].Jt.dtype ) + Jz_m0_global = np.zeros( + (grid[0].Jz.shape[0], grid[0].Jz.shape[1], self.nthreads), + dtype=grid[0].Jz.dtype ) + Jr_m1_global = np.zeros( + (grid[1].Jr.shape[0], grid[1].Jr.shape[1], self.nthreads), + dtype=grid[1].Jr.dtype ) + Jt_m1_global = np.zeros( + (grid[1].Jt.shape[0], grid[1].Jt.shape[1], self.nthreads), + dtype=grid[1].Jt.dtype ) + Jz_m1_global = np.zeros( + (grid[1].Jz.shape[0], grid[1].Jz.shape[1], self.nthreads), + dtype=grid[1].Jz.dtype ) + # Deposit J using CPU threading + if self.particle_shape == 'linear': + 
deposit_J_prange_linear( self.x, self.y, self.z, self.w, self.ux, self.uy, self.uz, self.inv_gamma, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, + Jr_m0_global, Jr_m1_global, + Jt_m0_global, Jt_m1_global, + Jz_m0_global, Jz_m1_global, grid[0].Jr, grid[1].Jr, grid[0].Jt, grid[1].Jt, grid[0].Jz, grid[1].Jz, - self.cell_idx, self.prefix_sum) + self.nthreads, tx_chunks, tx_N ) + elif self.particle_shape == 'cubic': + deposit_J_prange_cubic( + self.x, self.y, self.z, self.w, + self.ux, self.uy, self.uz, self.inv_gamma, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + Jr_m0_global, Jr_m1_global, + Jt_m0_global, Jt_m1_global, + Jz_m0_global, Jz_m1_global, + grid[0].Jr, grid[1].Jr, + grid[0].Jt, grid[1].Jt, + grid[0].Jz, grid[1].Jz, + self.nthreads, tx_chunks, tx_N ) else: raise ValueError("`particle_shape` should be either \ - 'linear', 'linear_atomic' or 'cubic' \ + 'linear' or 'cubic' \ but is `%s`" % self.particle_shape) + # Sum thread-local results to main field array + grid[0].Jr = np.sum(Jr_m0_global, axis=2) + grid[0].Jt = np.sum(Jt_m0_global, axis=2) + grid[0].Jz = np.sum(Jz_m0_global, axis=2) + grid[1].Jr = np.sum(Jr_m1_global, axis=2) + grid[1].Jt = np.sum(Jt_m1_global, axis=2) + grid[1].Jz = np.sum(Jz_m1_global, axis=2) + else: raise ValueError("`fieldtype` should be either 'J' or \ 'rho', but is `%s`" % fieldtype) - - - # CPU version + # CPU single-core version else: # Preliminary arrays for the cylindrical conversion r = np.sqrt( self.x**2 + self.y**2 ) @@ -745,7 +886,7 @@ def deposit( self, fld, fieldtype ) : # number of elements in the grid list Nm = len(grid) - if fieldtype == 'rho' : + if fieldtype == 'rho': # --------------------------------------- # Deposit the charge density mode by mode # --------------------------------------- @@ -765,7 +906,7 @@ def deposit( self, fld, fieldtype ) : deposit_field_numba(self.w*exptheta, grid[m].rho, iz, ir, Sz, Sr, -1.) 
- elif fieldtype == 'J' : + elif fieldtype == 'J': # ---------------------------------------- # Deposit the current density mode by mode # ---------------------------------------- @@ -793,9 +934,9 @@ def deposit( self, fld, fieldtype ) : deposit_field_numba(Jz*exptheta, grid[m].Jz, iz, ir, Sz, Sr, -1.) - else : - raise ValueError( - "`fieldtype` should be either 'J' or 'rho', but is `%s`" %fieldtype ) + else: + raise ValueError("`fieldtype` should be either 'J' or \ + 'rho', but is `%s`" % fieldtype) def sort_particles(self, fld): """ diff --git a/fbpic/particles/utility_methods.py b/fbpic/particles/utility_methods.py deleted file mode 100644 index d0de6c5b8..000000000 --- a/fbpic/particles/utility_methods.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the optimized particles methods that use numba on a CPU -""" -import numpy as np - -# ----------------------- -# Particle shapes utility -# ----------------------- - -def weights(x, invdx, offset, Nx, direction, shape_order): - """ - Return the array of cell indices and corresponding shape factors - for current/charge deposition and field gathering - - Parameters: - ----------- - x : 1darray of floats (in meters) - Array of particle positions along a given direction - (one element per macroparticle) - - invdx : float (in meters^-1) - Inverse of the grid step along the considered direction - - offset : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nx : int - Number of gridpoints along the considered direction - - direction : string - Determines the boundary conditions. Either 'r' or 'z' - - shape_order : int - Order of the shape factor. 
- Either 1 or 3 - - Returns: - -------- - A tuple containing : - - i: 2darray of ints - An array of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - (i.e. the number of elements in the array x) - This array contains the indices of the grid cells - (along the axis specified by `direction`) where each macroparticle - deposits charge/current and gathers field data. - - S: 2darray of floats - An array of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - (i.e. the number of elements in the array x) - This array contains the shape factors (a.k.a. interpolation weights) - that correspond to each of the indices in the array `i`. - """ - # Positions of the particles, in the cell unit - x_cell = invdx*(x - offset) - 0.5 - - # Initialize empty arrays of the correct size - i = np.empty( (shape_order+1, len(x)), dtype=np.int64) - S = np.empty( (shape_order+1, len(x)), dtype=np.float64) - - # Indices and shapes - if shape_order == 1: - i[0,:] = np.floor(x_cell).astype('int') - i[1,:] = i[0,:] + 1 - # Linear weight - S[0,:] = i[1,:] - x_cell - S[1,:] = x_cell - i[0,:] - elif shape_order == 3: - i[0,:] = np.floor(x_cell).astype('int') - 1 - i[1,:] = i[0,:] + 1 - i[2,:] = i[0,:] + 2 - i[3,:] = i[0,:] + 3 - # Cubic Weights - S[0,:] = -1./6. * ((x_cell-i[0])-2)**3 - S[1,:] = 1./6. * (3*((x_cell-i[1])**3) - 6*((x_cell-i[1])**2)+4) - S[2,:] = 1./6. * (3*((i[2]-x_cell)**3) - 6*((i[2]-x_cell)**2)+4) - S[3,:] = -1./6. * ((i[3]-x_cell)-2)**3 - else: - raise ValueError("shapes other than linear and cubic are not supported yet.") - - # Periodic boundary conditions in z - if direction == 'z': - # Lower Bound Periodic - i = np.where( i < 0, i+Nx, i ) - # Upper Bound Periodic - i = np.where( i > Nx-1, i-Nx, i ) - # Absorbing boundary condition at the upper r boundary - elif direction == 'r': - i = np.where( i > Nx-1, Nx-1, i ) - # Note: The lower bound index shift for r is done in the gather - # and deposit methods because the sign changes. 
- # This avoids using specific guard cells. - else: - raise ValueError("Unrecognized `direction` : %s" % direction) - - # Return the result - return( i, S ) - -# ---------------------------- -# Angle initialization utility -# ---------------------------- - -def unalign_angles( thetap, Npz, Npr, method='irrational' ) : - """ - Shift the angles so that the particles are - not all aligned along the arms of a star transversely - - The fact that the particles are all aligned can produce - numerical artefacts, especially if the polarization of the laser - is aligned with this direction. - - Here, for each position in r and z, we add the *same* - shift for all the Nptheta particles that are at this position. - (This preserves the fact that certain modes are 0 initially.) - How this shift varies from one position to another depends on - the method chosen. - - Parameters - ---------- - thetap : 3darray of floats - An array of shape (Npr, Npz, Nptheta) containing the angular - positions of the particles, and which is modified by this function. 
- - Npz, Npr : ints - The number of macroparticles along the z and r directions - - method : string - Either 'random' or 'irrational' - """ - # Determine the angle shift - if method == 'random' : - angle_shift = 2*np.pi*np.random.rand(Npz, Npr) - elif method == 'irrational' : - # Subrandom sequence, by adding irrational number (sqrt(2) and sqrt(3)) - # This ensures that the sequence does not wrap around and induce - # correlations - shiftr = np.sqrt(2)*np.arange(Npr) - shiftz = np.sqrt(3)*np.arange(Npz) - angle_shift = 2*np.pi*( shiftz[:,np.newaxis] + shiftr[np.newaxis,:] ) - angle_shift = np.mod( angle_shift, 2*np.pi ) - else : - raise ValueError( - "method must be either 'random' or 'irrational' but is %s" %method ) - - # Add the angle shift to thetap - # np.newaxis ensures that the angles that are at the same positions - # in r and z have the same shift - thetap[:,:,:] = thetap[:,:,:] + angle_shift[:,:, np.newaxis] From 497c55a8ef8d3a1ce076974dea60c2757aaa07f1 Mon Sep 17 00:00:00 2001 From: Manuel Kirchen Date: Fri, 14 Jul 2017 18:53:15 +0200 Subject: [PATCH 02/36] Initial CPU multi-threading implementation (part 2) --- fbpic/particles/deposition/__init__.py | 0 fbpic/particles/deposition/cuda_methods.py | 1683 +++++++++++++++++ fbpic/particles/deposition/numba_methods.py | 66 + .../particles/deposition/threading_methods.py | 454 +++++ fbpic/particles/gathering/__init__.py | 0 fbpic/particles/gathering/cuda_methods.py | 599 ++++++ fbpic/particles/gathering/numba_methods.py | 82 + .../particles/gathering/threading_methods.py | 580 ++++++ fbpic/particles/push/__init__.py | 0 fbpic/particles/push/cuda_methods.py | 191 ++ fbpic/particles/push/numba_methods.py | 110 ++ fbpic/particles/push/threading_methods.py | 114 ++ fbpic/particles/utilities/__init__.py | 0 fbpic/particles/utilities/cuda_sorting.py | 182 ++ fbpic/particles/utilities/utility_methods.py | 155 ++ 15 files changed, 4216 insertions(+) create mode 100644 fbpic/particles/deposition/__init__.py create 
mode 100644 fbpic/particles/deposition/cuda_methods.py create mode 100644 fbpic/particles/deposition/numba_methods.py create mode 100644 fbpic/particles/deposition/threading_methods.py create mode 100644 fbpic/particles/gathering/__init__.py create mode 100644 fbpic/particles/gathering/cuda_methods.py create mode 100644 fbpic/particles/gathering/numba_methods.py create mode 100644 fbpic/particles/gathering/threading_methods.py create mode 100644 fbpic/particles/push/__init__.py create mode 100644 fbpic/particles/push/cuda_methods.py create mode 100644 fbpic/particles/push/numba_methods.py create mode 100644 fbpic/particles/push/threading_methods.py create mode 100644 fbpic/particles/utilities/__init__.py create mode 100644 fbpic/particles/utilities/cuda_sorting.py create mode 100644 fbpic/particles/utilities/utility_methods.py diff --git a/fbpic/particles/deposition/__init__.py b/fbpic/particles/deposition/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/fbpic/particles/deposition/cuda_methods.py b/fbpic/particles/deposition/cuda_methods.py new file mode 100644 index 000000000..2686479bf --- /dev/null +++ b/fbpic/particles/deposition/cuda_methods.py @@ -0,0 +1,1683 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the deposition methods for rho and J for linear and cubic +order shapes on the GPU using CUDA. 
+""" +from numba import cuda, int64 +import math +from scipy.constants import c +import numpy as np + +# ------------------------------- +# Particle shape Factor functions +# ------------------------------- + +# Linear shapes +@cuda.jit(device=True, inline=True) +def z_shape_linear(cell_position, index): + iz = int64(math.floor(cell_position)) + if index == 0: + return iz+1.-cell_position + if index == 1: + return cell_position - iz + +@cuda.jit(device=True, inline=True) +def r_shape_linear(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(ir+1.-cell_position) + if index == 1: + return flip_factor*(cell_position - ir) + +# Cubic shapes +@cuda.jit(device=True, inline=True) +def z_shape_cubic(cell_position, index): + iz = int64(math.floor(cell_position)) - 1 + if index == 0: + return (-1./6.)*((cell_position-iz)-2)**3 + if index == 1: + return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) + if index == 2: + return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) + if index == 3: + return (-1./6.)*(((iz+3)-cell_position)-2)**3 + +@cuda.jit(device=True, inline=True) +def r_shape_cubic(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) - 1 + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 + if index == 1: + if ir+1 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) + if index == 2: + if ir+2 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) + if index == 3: + if ir+3 < 0: + flip_factor = -1. 
+ return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 + +# ------------------------------- +# Field deposition - linear - rho +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + int32[:], int32[:])') +def deposit_rho_gpu_linear(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0, rho_m1, + cell_idx, prefix_sum): + """ + Deposition of the charge density rho using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of rho that is deposited to the + 4 cells surounding the particle based on its shape (linear). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 4 variables (one for each possible direction, + e.g. upper in z, lower in r) to maintain parallelism while + avoiding any race conditions. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0, rho_m1 : 2darrays of complexs + The charge density on the interpolation grid for + mode 0 and 1. 
(is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i / Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare local field arrays + R_m0_00 = 0. + R_m0_01 = 0. + R_m0_10 = 0. + R_m0_11 = 0. + + R_m1_00 = 0. + 0.j + R_m1_01 = 0. + 0.j + R_m1_10 = 0. + 0.j + R_m1_11 = 0. 
+ 0.j + + for j in range(frequency_per_cell): + # Get the particle index before the sorting + # -------------------------------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + + # Boundary Region Shifts + ir_lower = int64(math.floor(r_cell)) + + R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal + + if ir_lower == -1: + R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + else: + R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_11 += 
r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if ir_cell+1 > (Nr-1): + shift_r = -1 + if iz_cell+1 > Nz-1: + shift_z -= Nz + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_00.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_00.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_00.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell), R_m0_01.real) + cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell), R_m1_01.real) + cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell), R_m1_01.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell+1 + shift_r), R_m0_10.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell+1 + shift_r), R_m1_10.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell+1 + shift_r), R_m1_10.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m0_11.real) + cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.real) + cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.imag) + + +# ------------------------------- +# Field deposition - linear - J +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:],\ + int32[:], int32[:])') +def deposit_J_gpu_linear(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0, j_r_m1, + j_t_m0, j_t_m1, + j_z_m0, j_z_m1, + cell_idx, prefix_sum): + """ + Deposition of the current J using numba on the GPU. + Iterates over the cells and over the particles per cell. 
+ Calculates the weighted amount of J that is deposited to the + 4 cells surounding the particle based on its shape (linear). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 4 variables (one for each possible direction, + e.g. upper in z, lower in r) to maintain parallelism while + avoiding any race conditions. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs + The current component in each direction (r, t, z) + on the interpolation grid for mode 0 and 1. + (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i/Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). 
+ incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode for r,t and z. + + J_r_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m0_00 = 0.# + 0.j + J_t_m1_00 = 0. + 0.j + J_z_m0_00 = 0. + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m0_01 = 0. + J_t_m1_01 = 0. + 0.j + J_z_m0_01 = 0. + J_z_m1_01 = 0. + 0.j + + J_r_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m0_10 = 0. + J_t_m1_10 = 0. + 0.j + J_z_m0_10 = 0. + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m0_11 = 0. + J_t_m1_11 = 0. + 0.j + J_z_m0_11 = 0. + J_z_m1_11 = 0. + 0.j + + + # Loop over the number of particles per cell + for j in range(frequency_per_cell): + # Get the particle index + # ---------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Velocity + uxj = ux[ptcl_idx] + uyj = uy[ptcl_idx] + uzj = uz[ptcl_idx] + # Inverse gamma + inv_gammaj = inv_gamma[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. 
+ exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Deposit on local copies at respective position + ir_lower = int64(math.floor(r_cell)) + + J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Take into account lower r flips + if ir_lower == -1: + J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * 
J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + else: + J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if (ir_cell+1) > (Nr-1): + shift_r = -1 + if (iz_cell+1) > Nz-1: + shift_z -= Nz + + # Atomically add the registers to 
global memory + if frequency_per_cell > 0: + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_00.real) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_00.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_00.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell), J_r_m0_01.real) + cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell+1 + shift_r), J_r_m0_10.real) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m0_11.real) + cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_00.real) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_00.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_00.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell), J_t_m0_01.real) + cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell+1 + shift_r), J_t_m0_10.real) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m0_11.real) + cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, 
ir_cell), J_z_m0_00.real) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_00.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_00.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell), J_z_m0_01.real) + cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell+1 + shift_r), J_z_m0_10.real) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m0_11.real) + cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.imag) + +# ------------------------------- +# Field deposition - cubic - rho +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + int32[:], int32[:])') +def deposit_rho_gpu_cubic(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0, rho_m1, + cell_idx, prefix_sum): + """ + Deposition of the charge density rho using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of rho that is deposited to the + 16 cells surounding the particle based on its shape (cubic). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 16 variables (one for each surrounding cell) to + maintain parallelism while avoiding any race conditions. 
+ + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0, rho_m1 : 2darrays of complexs + The charge density on the interpolation grid for + mode 0 and 1. (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i / Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare local field arrays + R_m0_00 = 0. + R_m1_00 = 0. + 0.j + + R_m0_01 = 0. + R_m1_01 = 0. + 0.j + + R_m0_02 = 0. + R_m1_02 = 0. + 0.j + + R_m0_03 = 0. + R_m1_03 = 0. + 0.j + + R_m0_10 = 0. + R_m1_10 = 0. + 0.j + + R_m0_11 = 0. + R_m1_11 = 0. + 0.j + + R_m0_12 = 0. + R_m1_12 = 0. + 0.j + + R_m0_13 = 0. + R_m1_13 = 0. + 0.j + + R_m0_20 = 0. + R_m1_20 = 0. + 0.j + + R_m0_21 = 0. + R_m1_21 = 0. + 0.j + + R_m0_22 = 0. + R_m1_22 = 0. + 0.j + + R_m0_23 = 0. 
+ R_m1_23 = 0. + 0.j + + R_m0_30 = 0. + R_m1_30 = 0. + 0.j + + R_m0_31 = 0. + R_m1_31 = 0. + 0.j + + R_m0_32 = 0. + R_m1_32 = 0. + 0.j + + R_m0_33 = 0. + R_m1_33 = 0. + 0.j + + for j in range(frequency_per_cell): + # Get the particle index before the sorting + # -------------------------------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + # Compute values in local copies and consider boundaries + ir0 = int64(math.floor(r_cell)) - 1 + + if (ir0 == -2): + R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 
1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + if (ir0 == -1): + R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 
0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 3)*R_m1_scal + if (ir0 >= 0): + R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m0_00.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell - 1 + srl), R_m0_01.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell - 1 + srl), R_m1_01.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell - 1 + srl), R_m1_01.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m0_02.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m0_03.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), 
R_m1_03.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell ), R_m0_10.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell ), R_m1_10.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell ), R_m1_10.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_11.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_11.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_11.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell), R_m0_12.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell), R_m1_12.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell), R_m1_12.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell), R_m0_13.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell), R_m1_13.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell), R_m1_13.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m0_20.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 1 + sru), R_m0_21.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 1 + sru), R_m1_21.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 1 + sru), R_m1_21.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m0_22.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m0_23.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m0_30.real) + 
cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 2 + sru2), R_m0_31.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 2 + sru2), R_m1_31.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 2 + sru2), R_m1_31.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m0_32.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m0_33.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.imag) + + +# ------------------------------- +# Field deposition - cubic - J +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:],\ + int32[:], int32[:])') +def deposit_J_gpu_cubic(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0, j_r_m1, + j_t_m0, j_t_m1, + j_z_m0, j_z_m1, + cell_idx, prefix_sum): + """ + Deposition of the current J using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of J that is deposited to the + 16 cells surounding the particle based on its shape (cubic). 
+ + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 16 variables (one for each cell) to maintain + parallelism while avoiding any race conditions. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs + The current component in each direction (r, t, z) + on the interpolation grid for mode 0 and 1. + (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i/Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). 
+ if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode for r,t and z. + J_r_m0_00 = 0. + J_t_m0_00 = 0. + J_z_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m1_00 = 0. + 0.j + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_t_m0_01 = 0. + J_z_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m1_01 = 0. + 0.j + J_z_m1_01 = 0. + 0.j + + J_r_m0_02 = 0. + J_t_m0_02 = 0. + J_z_m0_02 = 0. + J_r_m1_02 = 0. + 0.j + J_t_m1_02 = 0. + 0.j + J_z_m1_02 = 0. + 0.j + + J_r_m0_03 = 0. + J_t_m0_03 = 0. + J_z_m0_03 = 0. + J_r_m1_03 = 0. + 0.j + J_t_m1_03 = 0. + 0.j + J_z_m1_03 = 0. + 0.j + + J_r_m0_10 = 0. + J_t_m0_10 = 0. + J_z_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m1_10 = 0. + 0.j + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_t_m0_11 = 0. + J_z_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m1_11 = 0. + 0.j + J_z_m1_11 = 0. + 0.j + + J_r_m0_12 = 0. + J_t_m0_12 = 0. + J_z_m0_12 = 0. + J_r_m1_12 = 0. + 0.j + J_t_m1_12 = 0. + 0.j + J_z_m1_12 = 0. + 0.j + + J_r_m0_13 = 0. + J_t_m0_13 = 0. + J_z_m0_13 = 0. + J_r_m1_13 = 0. + 0.j + J_t_m1_13 = 0. + 0.j + J_z_m1_13 = 0. + 0.j + + J_r_m0_20 = 0. + J_t_m0_20 = 0. + J_z_m0_20 = 0. + J_r_m1_20 = 0. + 0.j + J_t_m1_20 = 0. + 0.j + J_z_m1_20 = 0. + 0.j + + J_r_m0_21 = 0. + J_t_m0_21 = 0. + J_z_m0_21 = 0. + J_r_m1_21 = 0. + 0.j + J_t_m1_21 = 0. + 0.j + J_z_m1_21 = 0. + 0.j + + J_r_m0_22 = 0. + J_t_m0_22 = 0. + J_z_m0_22 = 0. + J_r_m1_22 = 0. + 0.j + J_t_m1_22 = 0. + 0.j + J_z_m1_22 = 0. + 0.j + + J_r_m0_23 = 0. + J_t_m0_23 = 0. + J_z_m0_23 = 0. + J_r_m1_23 = 0. + 0.j + J_t_m1_23 = 0. + 0.j + J_z_m1_23 = 0. + 0.j + + J_r_m0_30 = 0. + J_t_m0_30 = 0. + J_z_m0_30 = 0. + J_r_m1_30 = 0. + 0.j + J_t_m1_30 = 0. + 0.j + J_z_m1_30 = 0. + 0.j + + J_r_m0_31 = 0. + J_t_m0_31 = 0. + J_z_m0_31 = 0. + J_r_m1_31 = 0. + 0.j + J_t_m1_31 = 0. + 0.j + J_z_m1_31 = 0. + 0.j + + J_r_m0_32 = 0. 
+ J_t_m0_32 = 0. + J_z_m0_32 = 0. + J_r_m1_32 = 0. + 0.j + J_t_m1_32 = 0. + 0.j + J_z_m1_32 = 0. + 0.j + + J_r_m0_33 = 0. + J_t_m0_33 = 0. + J_z_m0_33 = 0. + J_r_m1_33 = 0. + 0.j + J_t_m1_33 = 0. + 0.j + J_z_m1_33 = 0. + 0.j + + # Loop over the number of particles per cell + for j in range(frequency_per_cell): + # Get the particle index + # ---------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Velocity + uxj = ux[ptcl_idx] + uyj = uy[ptcl_idx] + uzj = uz[ptcl_idx] + # Inverse gamma + inv_gammaj = inv_gamma[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. 
+ exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Compute values in local copies and consider boundaries + ir0 = int64(math.floor(r_cell)) - 1 + + if (ir0 == -2): + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal 
+ J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + + J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 
0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 
0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += 
r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir0 == -1): + J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 
2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += 
r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir0 >= 0): + J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 
1)*J_r_m1_scal + J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += 
r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + 
J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m0_00) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), 
J_r_m1_00.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell - 1 + srl), J_r_m0_01) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m0_02) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m0_03) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell ), J_r_m0_10) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_11) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_11.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_11.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell), J_r_m0_12) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell), J_r_m0_13) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m0_20) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 1 + sru), J_r_m0_21) + 
cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m0_22) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m0_23) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m0_30) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 2 + sru2), J_r_m0_31) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m0_32) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m0_33) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m0_00) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell - 
1 + srl), J_t_m0_01) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m0_02) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m0_03) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell ), J_t_m0_10) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_11) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_11.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_11.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell), J_t_m0_12) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell), J_t_m0_13) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m0_20) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 1 + sru), J_t_m0_21) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.real) + 
cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m0_22) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m0_23) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m0_30) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 2 + sru2), J_t_m0_31) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m0_32) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m0_33) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.imag) + + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m0_00) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell - 1 + srl), J_z_m0_01) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell - 1 + 
srl), J_z_m1_01.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m0_02) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m0_03) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell ), J_z_m0_10) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_11) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_11.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_11.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell), J_z_m0_12) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell), J_z_m0_13) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m0_20) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 1 + sru), J_z_m0_21) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.imag) + + 
cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m0_22) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m0_23) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m0_30) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 2 + sru2), J_z_m0_31) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m0_32) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m0_33) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.imag) diff --git a/fbpic/particles/deposition/numba_methods.py b/fbpic/particles/deposition/numba_methods.py new file mode 100644 index 000000000..4ad61f9f9 --- /dev/null +++ b/fbpic/particles/deposition/numba_methods.py @@ -0,0 +1,66 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the deposition methods for rho and J for 
# ---------------------------------------------------------------------------
# Serial CPU deposition (fbpic/particles/deposition/numba_methods.py)
# ---------------------------------------------------------------------------
import numba
from scipy.constants import c, e

@numba.njit
def deposit_field_numba(Fptcl, Fgrid,
                        iz, ir, Sz, Sr, sign_guards):
    """
    Perform the deposition using numba

    Parameters
    ----------
    Fptcl : 1darray of complexs
        (one element per macroparticle)
        Contains the charge or current for each macroparticle (already
        multiplied by exp(im theta), from which to do the deposition

    Fgrid : 2darray of complexs
        Contains the fields on the interpolation grid.
        Is modified by this function

    iz, ir : 2darray of ints
        Arrays of shape (shape_order+1, Ntot)
        where Ntot is the number of macroparticles
        Contains the index of the cells that each macroparticle
        will deposit to.

    Sz, Sr: 2darray of floats
        Arrays of shape (shape_order+1, Ntot)
        where Ntot is the number of macroparticles
        Contains the weight for respective cells from iz and ir,
        for each macroparticle.

    sign_guards : float
        The sign (+1 or -1) with which the weight of the guard cells should
        be added to the 0th cell.
    """
    # Get the total number of particles and of cells touched per direction
    Ntot = len(Fptcl)
    n_cells_r = ir.shape[0]
    n_cells_z = iz.shape[0]

    # Loop over all particles
    for ip in range(Ntot):
        # Loop over adjacent cells (given by shape order)
        for cell_index_r in range(n_cells_r):
            # Use helper variables `ir_corr` and `Sr_corr`, in order to avoid
            # modifying ir and Sr in place. The correction only depends on
            # the radial index, so it is done once, outside of the z loop.
            ir_corr = ir[cell_index_r, ip]
            Sr_corr = Sr[cell_index_r, ip]
            if ir_corr < 0:
                # Guard cell: fold the index back into the grid
                # (-1 -> 0, -2 -> 1, ...) and apply the guard sign
                ir_corr = abs(ir_corr) - 1
                Sr_corr = sign_guards * Sr_corr
            for cell_index_z in range(n_cells_z):
                # Deposit field from particle to the respective grid point
                Fgrid[ iz[cell_index_z, ip], ir_corr ] += \
                    Sz[cell_index_z, ip] * Sr_corr * Fptcl[ip]


# ---------------------------------------------------------------------------
# Multi-threaded CPU deposition
# (fbpic/particles/deposition/threading_methods.py)
#
# Copyright 2016, FBPIC contributors
# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
# License: 3-Clause-BSD-LBNL
#
# This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
# It defines the deposition methods for rho and J for linear and cubic
# order shapes on the CPU with threading.
# ---------------------------------------------------------------------------
import numba
from numba import prange, int64
import math
from scipy.constants import c
import numpy as np

# -------------------------------
# Particle shape Factor functions
# -------------------------------

# Linear shapes
@numba.njit
def z_shape_linear(cell_position, index):
    """
    Return the linear shape factor along z for one of the 2 cells
    around `cell_position` (position in cell units).

    index == 0 selects the cell floor(cell_position), index == 1 the
    next one. No value is returned for any other index.
    """
    iz = int64(math.floor(cell_position))
    if index == 0:
        return iz+1.-cell_position
    if index == 1:
        return cell_position - iz

@numba.njit
def r_shape_linear(cell_position, index):
    """
    Return the linear shape factor along r for one of the 2 cells
    around `cell_position` (position in cell units).

    Same as `z_shape_linear`, except that the weight of the lower cell
    (index == 0) is multiplied by -1 when that cell has a negative index,
    i.e. when it lies below the first radial cell.
    """
    flip_factor = 1.
    ir = int64(math.floor(cell_position))
    if index == 0:
        if ir < 0:
            flip_factor = -1.
        return flip_factor*(ir+1.-cell_position)
    if index == 1:
        return flip_factor*(cell_position - ir)

# Cubic shapes
@numba.njit
def z_shape_cubic(cell_position, index):
    """
    Return the cubic shape factor along z for one of the 4 cells
    around `cell_position` (position in cell units).

    index runs from 0 to 3 and selects the cells
    floor(cell_position)-1 ... floor(cell_position)+2.
    No value is returned for any other index.
    """
    iz = int64(math.floor(cell_position)) - 1
    if index == 0:
        return (-1./6.)*((cell_position-iz)-2)**3
    if index == 1:
        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)
    if index == 2:
        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)
    if index == 3:
        return (-1./6.)*(((iz+3)-cell_position)-2)**3

@numba.njit
def r_shape_cubic(cell_position, index):
    """
    Return the cubic shape factor along r for one of the 4 cells
    around `cell_position` (position in cell units).

    Same as `z_shape_cubic`, except that the weight is multiplied by -1
    whenever the selected cell has a negative index, i.e. when it lies
    below the first radial cell.
    """
    flip_factor = 1.
    ir = int64(math.floor(cell_position)) - 1
    if index == 0:
        if ir < 0:
            flip_factor = -1.
        return flip_factor*(-1./6.)*((cell_position-ir)-2)**3
    if index == 1:
        if ir+1 < 0:
            flip_factor = -1.
        return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4)
    if index == 2:
        if ir+2 < 0:
            flip_factor = -1.
        return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4)
    if index == 3:
        if ir+3 < 0:
            flip_factor = -1.
        return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3

# -------------------------------
# Field deposition - linear - rho
# -------------------------------

@numba.njit(parallel=True)
def deposit_rho_prange_linear(x, y, z, w,
                              invdz, zmin, Nz,
                              invdr, rmin, Nr,
                              rho_m0_global, rho_m1_global,
                              rho_m0, rho_m1,
                              nthreads, tx_chunks, tx_N):
    """
    Deposition of the charge density rho using numba prange on the CPU.
    Iterates over the threads in parallel, while each thread iterates
    over a batch of particles. Each thread accumulates its deposit in a
    private copy of the grid, which is then written into its own slice
    `[:, :, tx]` of the global arrays. The sum over the thread axis is
    NOT performed by this function.

    Calculates the weighted amount of rho that is deposited to the
    4 cells surrounding the particle based on its shape (linear).

    Parameters
    ----------
    x, y, z : 1darray of floats (in meters)
        The position of the particles

    w : 1d array of floats
        The weights of the particles

    invdz, invdr : float (in meters^-1)
        Inverse of the grid step along the considered direction

    zmin, rmin : float (in meters)
        Position of the edge of the simulation box,
        along the considered direction

    Nz, Nr : int
        Number of gridpoints along the considered direction

    rho_m0_global, rho_m1_global : 3darrays
        Indexed as [iz, ir, tx]. The slice of thread `tx` is overwritten
        with the charge density deposited by that thread, for mode 0
        and 1. (is modified by this function)

    rho_m0, rho_m1 : 2darrays of complexs
        The charge density on the interpolation grid for mode 0 and 1.
        Only used as templates (shape and type) for the thread-local
        arrays; they are not modified by this function.

    nthreads : int
        Number of iterations of the parallel loop (one per thread)

    tx_chunks : 1darray of ints
        tx_chunks[tx] is the number of particles treated by thread `tx`

    tx_N : int
        Offset between the first particles of two consecutive threads:
        thread `tx` treats the particles starting at index tx*tx_N
    """
    # Deposit the field in parallel: each iteration of the prange loop
    # treats one chunk of particles
    for tx in prange( nthreads ):
        # Create thread_local helper arrays
        # FIXME! ( instead of using zeros_like,
        # it would be nicer to use np.zeros((Nz,Nr)) )
        rho_m0_thread = np.zeros_like( rho_m0 )
        rho_m1_thread = np.zeros_like( rho_m1 )
        # Loop over all particles in thread chunk
        for idx in range( tx_chunks[tx] ):
            # Calculate thread local particle index
            ptcl_idx = idx + tx*tx_N
            # Preliminary arrays for the cylindrical conversion
            # --------------------------------------------
            # Position
            xj = x[ptcl_idx]
            yj = y[ptcl_idx]
            zj = z[ptcl_idx]
            # Weights
            wj = w[ptcl_idx]

            # Cylindrical conversion
            rj = math.sqrt(xj**2 + yj**2)
            # Avoid division by 0.
            if (rj != 0.):
                invr = 1./rj
                cos = xj*invr  # Cosine
                sin = yj*invr  # Sine
            else:
                cos = 1.
                sin = 0.
            exptheta_m0 = 1.
            exptheta_m1 = cos + 1.j*sin

            # Positions of the particles, in the cell unit
            r_cell = invdr*(rj - rmin) - 0.5
            z_cell = invdz*(zj - zmin) - 0.5

            # Calculate rho
            # --------------------------------------------
            # Mode 0
            R_m0_scal = wj * exptheta_m0
            # Mode 1
            R_m1_scal = wj * exptheta_m1

            # Original index of the upper and lower cell
            ir_cell = int(math.floor( r_cell ))
            iz_cell = int(math.floor( z_cell ))

            # Treat the boundary conditions
            # guard cells in lower r
            if ir_cell < 0:
                ir_cell = 0
            # absorbing in upper r
            if ir_cell > Nr-1:
                ir_cell = Nr-1
            # periodic boundaries in z
            if iz_cell < 0:
                iz_cell += Nz
            if iz_cell > Nz-1:
                iz_cell -= Nz

            # Boundary Region Shifts
            # (unclamped radial index: -1 means that the lower cell
            # lies below the first radial cell)
            ir_flip = int( math.floor(r_cell) )

            # Shape factors of the 4 cells (first digit: r, second digit: z)
            S_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0)
            S_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1)
            S_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0)
            S_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1)

            R_m0_00 = S_00 * R_m0_scal
            R_m0_01 = S_01 * R_m0_scal
            R_m1_00 = S_00 * R_m1_scal
            R_m1_01 = S_01 * R_m1_scal

            # Take into account lower r flips
            if ir_flip == -1:
                # Both radial cells fold onto the first radial cell
                R_m0_00 += S_10 * R_m0_scal
                R_m0_01 += S_11 * R_m0_scal
                R_m1_00 += S_10 * R_m1_scal
                R_m1_01 += S_11 * R_m1_scal
                # Nothing is left for the next radial cell. These values
                # must be set explicitly: otherwise the deposit below would
                # use the values of a previous particle (or unset values).
                R_m0_10 = 0.
                R_m0_11 = 0.
                R_m1_10 = 0.j
                R_m1_11 = 0.j
            else:
                R_m0_10 = S_10 * R_m0_scal
                R_m0_11 = S_11 * R_m0_scal
                R_m1_10 = S_10 * R_m1_scal
                R_m1_11 = S_11 * R_m1_scal

            # Cell shifts for the simulation boundaries
            shift_r = 0
            shift_z = 0
            if ir_cell+1 > (Nr-1):
                shift_r = -1
            if iz_cell+1 > Nz-1:
                shift_z -= Nz

            # Write to thread local arrays
            rho_m0_thread[iz_cell, ir_cell] += R_m0_00
            rho_m1_thread[iz_cell, ir_cell] += R_m1_00

            rho_m0_thread[iz_cell+1 + shift_z, ir_cell] += R_m0_01
            rho_m1_thread[iz_cell+1 + shift_z, ir_cell] += R_m1_01

            rho_m0_thread[iz_cell, ir_cell+1 + shift_r] += R_m0_10
            rho_m1_thread[iz_cell, ir_cell+1 + shift_r] += R_m1_10

            rho_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
            rho_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11

        # Write thread local deposition arrays to global deposition arrays
        rho_m0_global[:,:,tx] = rho_m0_thread
        rho_m1_global[:,:,tx] = rho_m1_thread

    return

# -------------------------------
# Field deposition - linear - J
# -------------------------------

@numba.njit(parallel=True)
def deposit_J_prange_linear(x, y, z, w,
                            ux, uy, uz, inv_gamma,
                            invdz, zmin, Nz,
                            invdr, rmin, Nr,
                            j_r_m0_global, j_r_m1_global,
                            j_t_m0_global, j_t_m1_global,
                            j_z_m0_global, j_z_m1_global,
                            j_r_m0, j_r_m1,
                            j_t_m0, j_t_m1,
                            j_z_m0, j_z_m1,
                            nthreads, tx_chunks, tx_N):
    """
    Deposition of the current density J using numba prange on the CPU.
    Iterates over the threads in parallel, while each thread iterates
    over a batch of particles. Each thread accumulates its deposit in
    private copies of the grid, which are then written into its own slice
    `[:, :, tx]` of the global arrays. The sum over the thread axis is
    NOT performed by this function.

    Calculates the weighted amount of J that is deposited to the
    4 cells surrounding the particle based on its shape (linear).

    Parameters
    ----------
    x, y, z : 1darray of floats (in meters)
        The position of the particles

    w : 1d array of floats
        The weights of the particles

    ux, uy, uz : 1darray of floats
        The momenta of the particles. They are multiplied by
        c * inv_gamma to obtain the deposited current, so they are
        presumably dimensionless (normalized) momenta - to be confirmed
        against the caller.

    inv_gamma : 1darray of floats
        The inverse of the relativistic gamma factor

    invdz, invdr : float (in meters^-1)
        Inverse of the grid step along the considered direction

    zmin, rmin : float (in meters)
        Position of the edge of the simulation box,
        along the direction considered

    Nz, Nr : int
        Number of gridpoints along the considered direction

    j_r_m0_global, j_r_m1_global, j_t_m0_global, j_t_m1_global,
    j_z_m0_global, j_z_m1_global : 3darrays
        Indexed as [iz, ir, tx]. The slice of thread `tx` is overwritten
        with the current deposited by that thread, for each direction
        (r, t, z) and for mode 0 and 1. (is modified by this function)

    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1 : 2darrays of complexs
        The current component in each direction (r, t, z)
        on the interpolation grid for mode 0 and 1.
        Only used as templates (shape and type) for the thread-local
        arrays; they are not modified by this function.

    nthreads : int
        Number of iterations of the parallel loop (one per thread)

    tx_chunks : 1darray of ints
        tx_chunks[tx] is the number of particles treated by thread `tx`

    tx_N : int
        Offset between the first particles of two consecutive threads:
        thread `tx` treats the particles starting at index tx*tx_N
    """
    # Deposit the field in parallel: each iteration of the prange loop
    # treats one chunk of particles
    for tx in prange( nthreads ):
        # Create thread_local helper arrays
        # FIXME! ( instead of using zeros_like,
        # it would be nicer to use np.zeros((Nz,Nr)) )
        j_r_m0_thread = np.zeros_like( j_r_m0 )
        j_t_m0_thread = np.zeros_like( j_t_m0 )
        j_z_m0_thread = np.zeros_like( j_z_m0 )
        j_r_m1_thread = np.zeros_like( j_r_m1 )
        j_t_m1_thread = np.zeros_like( j_t_m1 )
        j_z_m1_thread = np.zeros_like( j_z_m1 )
        # Loop over all particles in thread chunk
        for idx in range( tx_chunks[tx] ):
            # Calculate thread local particle index
            ptcl_idx = idx + tx*tx_N
            # Preliminary arrays for the cylindrical conversion
            # --------------------------------------------
            # Position
            xj = x[ptcl_idx]
            yj = y[ptcl_idx]
            zj = z[ptcl_idx]
            # Velocity
            uxj = ux[ptcl_idx]
            uyj = uy[ptcl_idx]
            uzj = uz[ptcl_idx]
            # Inverse gamma
            inv_gammaj = inv_gamma[ptcl_idx]
            # Weights
            wj = w[ptcl_idx]

            # Cylindrical conversion
            rj = math.sqrt(xj**2 + yj**2)
            # Avoid division by 0.
            if (rj != 0.):
                invr = 1./rj
                cos = xj*invr  # Cosine
                sin = yj*invr  # Sine
            else:
                cos = 1.
                sin = 0.
            exptheta_m0 = 1.
            exptheta_m1 = cos + 1.j*sin

            # Get weights for the deposition
            # --------------------------------------------
            # Positions of the particles, in the cell unit
            r_cell = invdr*(rj - rmin) - 0.5
            z_cell = invdz*(zj - zmin) - 0.5

            # Calculate the currents
            # --------------------------------------------
            # Mode 0
            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
            # Mode 1
            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1

            # Original index of the upper and lower cell
            ir_cell = int(math.floor( r_cell ))
            iz_cell = int(math.floor( z_cell ))

            # Treat the boundary conditions
            # guard cells in lower r
            if ir_cell < 0:
                ir_cell = 0
            # absorbing in upper r
            if ir_cell > Nr-1:
                ir_cell = Nr-1
            # periodic boundaries in z
            if iz_cell < 0:
                iz_cell += Nz
            if iz_cell > Nz-1:
                iz_cell -= Nz

            # Boundary Region Shifts
            # (unclamped radial index: -1 means that the lower cell
            # lies below the first radial cell)
            ir_flip = int( math.floor(r_cell) )

            # Shape factors of the 4 cells (first digit: r, second digit: z)
            S_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0)
            S_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1)
            S_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0)
            S_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1)

            J_r_m0_00 = S_00 * J_r_m0_scal
            J_t_m0_00 = S_00 * J_t_m0_scal
            J_z_m0_00 = S_00 * J_z_m0_scal
            J_r_m0_01 = S_01 * J_r_m0_scal
            J_t_m0_01 = S_01 * J_t_m0_scal
            J_z_m0_01 = S_01 * J_z_m0_scal
            J_r_m1_00 = S_00 * J_r_m1_scal
            J_t_m1_00 = S_00 * J_t_m1_scal
            J_z_m1_00 = S_00 * J_z_m1_scal
            J_r_m1_01 = S_01 * J_r_m1_scal
            J_t_m1_01 = S_01 * J_t_m1_scal
            J_z_m1_01 = S_01 * J_z_m1_scal

            # Take into account lower r flips
            if ir_flip == -1:
                # Both radial cells fold onto the first radial cell
                J_r_m0_00 += S_10 * J_r_m0_scal
                J_t_m0_00 += S_10 * J_t_m0_scal
                J_z_m0_00 += S_10 * J_z_m0_scal
                J_r_m0_01 += S_11 * J_r_m0_scal
                J_t_m0_01 += S_11 * J_t_m0_scal
                J_z_m0_01 += S_11 * J_z_m0_scal
                J_r_m1_00 += S_10 * J_r_m1_scal
                J_t_m1_00 += S_10 * J_t_m1_scal
                J_z_m1_00 += S_10 * J_z_m1_scal
                J_r_m1_01 += S_11 * J_r_m1_scal
                J_t_m1_01 += S_11 * J_t_m1_scal
                J_z_m1_01 += S_11 * J_z_m1_scal
                # Nothing is left for the next radial cell. These values
                # must be set explicitly: otherwise the deposit below would
                # use the values of a previous particle (or unset values).
                J_r_m0_10 = 0.
                J_t_m0_10 = 0.
                J_z_m0_10 = 0.
                J_r_m0_11 = 0.
                J_t_m0_11 = 0.
                J_z_m0_11 = 0.
                J_r_m1_10 = 0.j
                J_t_m1_10 = 0.j
                J_z_m1_10 = 0.j
                J_r_m1_11 = 0.j
                J_t_m1_11 = 0.j
                J_z_m1_11 = 0.j
            else:
                J_r_m0_10 = S_10 * J_r_m0_scal
                J_t_m0_10 = S_10 * J_t_m0_scal
                J_z_m0_10 = S_10 * J_z_m0_scal
                J_r_m0_11 = S_11 * J_r_m0_scal
                J_t_m0_11 = S_11 * J_t_m0_scal
                J_z_m0_11 = S_11 * J_z_m0_scal
                J_r_m1_10 = S_10 * J_r_m1_scal
                J_t_m1_10 = S_10 * J_t_m1_scal
                J_z_m1_10 = S_10 * J_z_m1_scal
                J_r_m1_11 = S_11 * J_r_m1_scal
                J_t_m1_11 = S_11 * J_t_m1_scal
                J_z_m1_11 = S_11 * J_z_m1_scal

            # Cell shifts for the simulation boundaries
            shift_r = 0
            shift_z = 0
            if (ir_cell+1) > (Nr-1):
                shift_r = -1
            if (iz_cell+1) > Nz-1:
                shift_z -= Nz

            # Write to thread local arrays
            j_r_m0_thread[iz_cell, ir_cell] += J_r_m0_00
            j_r_m1_thread[iz_cell, ir_cell] += J_r_m1_00

            j_r_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_r_m0_01
            j_r_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_r_m1_01

            j_r_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_r_m0_10
            j_r_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_r_m1_10

            j_r_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11
            j_r_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11

            j_t_m0_thread[iz_cell, ir_cell] += J_t_m0_00
            j_t_m1_thread[iz_cell, ir_cell] += J_t_m1_00

            j_t_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_t_m0_01
            j_t_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_t_m1_01

            j_t_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_t_m0_10
            j_t_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_t_m1_10

            j_t_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11
            j_t_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11

            j_z_m0_thread[iz_cell, ir_cell] += J_z_m0_00
            j_z_m1_thread[iz_cell, ir_cell] += J_z_m1_00

            j_z_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_z_m0_01
            j_z_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_z_m1_01

            j_z_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_z_m0_10
            j_z_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_z_m1_10

            j_z_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11
            j_z_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11

        # Write thread local deposition arrays to global deposition arrays
        j_r_m0_global[:,:,tx] = j_r_m0_thread
        j_t_m0_global[:,:,tx] = j_t_m0_thread
        j_z_m0_global[:,:,tx] = j_z_m0_thread
        j_r_m1_global[:,:,tx] = j_r_m1_thread
        j_t_m1_global[:,:,tx] = j_t_m1_thread
        j_z_m1_global[:,:,tx] = j_z_m1_thread

    return
+ + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + (is modified by this function) + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel + # (for threads < number of particles) + if i < x.shape[0]: + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt( xj**2 + yj**2 ) + if (rj !=0. ) : + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else : + cos = 1. + sin = 0. + exptheta_m0 = 1. 
+ exptheta_m1 = cos - 1.j*sin + + # Get linear weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + # Original index of the uppper and lower cell + ir_lower = int(math.floor( r_cell )) + ir_upper = ir_lower + 1 + iz_lower = int(math.floor( z_cell )) + iz_upper = iz_lower + 1 + # Linear weight + Sr_lower = ir_upper - r_cell + Sr_upper = r_cell - ir_lower + Sz_lower = iz_upper - z_cell + Sz_upper = z_cell - iz_lower + # Set guard weights to zero + Sr_guard = 0. + + # Treat the boundary conditions + # -------------------------------------------- + # guard cells in lower r + if ir_lower < 0: + Sr_guard = Sr_lower + Sr_lower = 0. + ir_lower = 0 + # absorbing in upper r + if ir_lower > Nr-1: + ir_lower = Nr-1 + if ir_upper > Nr-1: + ir_upper = Nr-1 + # periodic boundaries in z + # lower z boundaries + if iz_lower < 0: + iz_lower += Nz + if iz_upper < 0: + iz_upper += Nz + # upper z boundaries + if iz_lower > Nz-1: + iz_lower -= Nz + if iz_upper > Nz-1: + iz_upper -= Nz + + #Precalculate Shapes + S_ll = Sz_lower*Sr_lower + S_lu = Sz_lower*Sr_upper + S_ul = Sz_upper*Sr_lower + S_uu = Sz_upper*Sr_upper + S_lg = Sz_lower*Sr_guard + S_ug = Sz_upper*Sr_guard + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Ez_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0] + Fz_m += 1. 
* S_ug * Ez_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Er_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Et_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Er_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Et_m1[ iz_upper, 0] + Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = cos*Fr - sin*Ft + Ey[i] = sin*Fr + cos*Ft + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Bz_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0] + Fz_m += 1. 
* S_ug * Bz_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ] + + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Br_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Bt_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Br_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Bt_m1[ iz_upper, 0] + Fz_m += -1. 
* S_ug * Bz_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Bx[i] = cos*Fr - sin*Ft + By[i] = sin*Fr + cos*Ft + Bz[i] = Fz + +# ----------------------- +# Field gathering cubic +# ----------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:], complex128[:,:], \ + float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:])') +def gather_field_gpu_cubic(x, y, z, + invdz, zmin, Nz, + invdr, rmin, Nr, + Er_m0, Et_m0, Ez_m0, + Er_m1, Et_m1, Ez_m1, + Br_m0, Bt_m0, Bz_m0, + Br_m1, Bt_m1, Bz_m1, + Ex, Ey, Ez, + Bx, By, Bz): + """ + Gathering of the fields (E and B) using numba on the GPU. + Iterates over the particles, calculates the weighted amount + of fields acting on each particle based on its shape (cubic). + Fields are gathered in cylindrical coordinates and then + transformed to cartesian coordinates. + Supports only mode 0 and 1. 
+ + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + (is modified by this function) + """ + + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel + # (for threads < number of particles) + if i < x.shape[0]: + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. 
+ exptheta_m1 = cos - 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particle, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the shape factors + Sr = cuda.local.array((4,), dtype=float64) + ir = cuda.local.array((4,), dtype=int64) + ir[0] = int64(math.floor(r_cell)) - 1 + ir[1] = ir[0] + 1 + ir[2] = ir[1] + 1 + ir[3] = ir[2] + 1 + Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3 + Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4) + Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4) + Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3 + iz = cuda.local.array((4,), dtype=int64) + Sz = cuda.local.array((4,), dtype=float64) + iz[0] = int64(math.floor(z_cell)) - 1 + iz[1] = iz[0] + 1 + iz[2] = iz[1] + 1 + iz[3] = iz[2] + 1 + Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3 + Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4) + Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4) + Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3 + # Lower and upper periodic boundary for z + for index_z in range(4): + if iz[index_z] < 0: + iz[index_z] += Nz + if iz[index_z] > Nz - 1: + iz[index_z] -= Nz + # Lower and upper boundary for r + for index_r in range(4): + if ir[index_r] < 0: + ir[index_r] = abs(ir[index_r])-1 + Sr[index_r] = (-1.)*Sr[index_r] + if ir[index_r] > Nr - 1: + ir[index_r] = Nr - 1 + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + for index_r in range(4): + for index_z in range(4): + Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]] + if Sz[index_z]*Sr[index_r] < 0: + Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Ez_m0[iz[index_z], ir[index_r]] + else: + Fz_m += Sz[index_z]*Sr[index_r]* \ + Ez_m0[iz[index_z], ir[index_r]] + + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + for index_r in range(4): + for index_z in range(4): + if Sz[index_z]*Sr[index_r] < 0: + Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Er_m1[iz[index_z], ir[index_r]] + Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Et_m1[iz[index_z], ir[index_r]] + else: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Er_m1[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Et_m1[iz[index_z], ir[index_r]] + Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]] + + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = (cos*Fr - sin*Ft) + Ey[i] = (sin*Fr + cos*Ft) + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
@numba.jit(nopython=True)
def gather_field_numba(exptheta, m, Fgrid, Fptcl,
                       iz, ir, Sz, Sr, sign_guards):
    """
    Perform the weighted sum of one grid field onto the macroparticles.

    For each macroparticle, the values of `Fgrid` at the cells listed in
    `iz` and `ir` are summed with the weights `Sz * Sr`, the sum is
    multiplied by `exptheta`, and the real part is added to `Fptcl`.

    Parameters
    ----------
    exptheta : 1darray of complexs
        (one element per macroparticle)
        Contains exp(-im theta) for each macroparticle

    m : int
        Index of the mode.
        Determines whether a factor 2 should be applied:
        the contribution is added once for m == 0 and twice for m > 0.
        (For m < 0, nothing is added.)

    Fgrid : 2darray of complexs
        Contains the fields on the interpolation grid,
        from which to do the gathering

    Fptcl : 1darray of floats
        (one element per macroparticle)
        Contains the fields for each macroparticle
        Is modified by this function (the gathered value is *added*)

    iz, ir : 2darray of ints
        Arrays of shape (shape_order+1, Ntot)
        where Ntot is the number of macroparticles
        Contains the index of the cells that each macroparticle
        will gather from. Negative values of `ir` denote guard cells
        below the axis; they are mirrored to the index `abs(ir) - 1`.

    Sz, Sr: 2darray of floats
        Arrays of shape (shape_order+1, Ntot)
        where Ntot is the number of macroparticles
        Contains the weight for respective cells from iz and ir,
        for each macroparticle.

    sign_guards : float
        The sign (+1 or -1) with which the weight of the guard cells should
        be added to the 0th cell.
    """
    # Get the total number of particles and the size of the stencils
    Ntot = len(Fptcl)
    n_cells_r = ir.shape[0]
    n_cells_z = iz.shape[0]

    # Loop over the particles
    for ip in range(Ntot):
        # Erase the temporary variable
        F = 0.j
        # Loop over all the adjacent cells (given by shape order)
        for cell_index_r in range(n_cells_r):
            # Correct the guard cell index and sign.
            # This only depends on the radial cell, so it is done once per
            # radial cell instead of once per (r, z) pair.
            # Helper variables `ir_corr` and `Sr_corr` are used because
            # ir and Sr must **not** be modified **in-place**: they are
            # reused when this function is called again with a different
            # value of sign_guards.
            ir_corr = ir[cell_index_r, ip]
            Sr_corr = Sr[cell_index_r, ip]
            if ir_corr < 0:
                ir_corr = abs(ir_corr) - 1
                Sr_corr = sign_guards * Sr_corr
            for cell_index_z in range(n_cells_z):
                # Gather the field value at the respective grid point
                F += Sz[cell_index_z, ip] * Sr_corr * \
                    Fgrid[ iz[cell_index_z, ip], ir_corr]

        # Add the complex phase
        if m == 0:
            Fptcl[ip] += (F * exptheta[ip]).real
        elif m > 0:
            Fptcl[ip] += 2 * (F * exptheta[ip]).real
+ + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + (is modified by this function) + """ + # Deposit the field per cell in parallel + for i in prange(x.shape[0]): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt( xj**2 + yj**2 ) + if (rj !=0. ) : + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else : + cos = 1. + sin = 0. + exptheta_m0 = 1. 
+ exptheta_m1 = cos - 1.j*sin + + # Get linear weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + # Original index of the uppper and lower cell + ir_lower = int(math.floor( r_cell )) + ir_upper = ir_lower + 1 + iz_lower = int(math.floor( z_cell )) + iz_upper = iz_lower + 1 + # Linear weight + Sr_lower = ir_upper - r_cell + Sr_upper = r_cell - ir_lower + Sz_lower = iz_upper - z_cell + Sz_upper = z_cell - iz_lower + # Set guard weights to zero + Sr_guard = 0. + + # Treat the boundary conditions + # -------------------------------------------- + # guard cells in lower r + if ir_lower < 0: + Sr_guard = Sr_lower + Sr_lower = 0. + ir_lower = 0 + # absorbing in upper r + if ir_lower > Nr-1: + ir_lower = Nr-1 + if ir_upper > Nr-1: + ir_upper = Nr-1 + # periodic boundaries in z + # lower z boundaries + if iz_lower < 0: + iz_lower += Nz + if iz_upper < 0: + iz_upper += Nz + # upper z boundaries + if iz_lower > Nz-1: + iz_lower -= Nz + if iz_upper > Nz-1: + iz_upper -= Nz + + #Precalculate Shapes + S_ll = Sz_lower*Sr_lower + S_lu = Sz_lower*Sr_upper + S_ul = Sz_upper*Sr_lower + S_uu = Sz_upper*Sr_upper + S_lg = Sz_lower*Sr_guard + S_ug = Sz_upper*Sr_guard + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Ez_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0] + Fz_m += 1. 
* S_ug * Ez_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Er_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Et_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Er_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Et_m1[ iz_upper, 0] + Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = cos*Fr - sin*Ft + Ey[i] = sin*Fr + cos*Ft + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Bz_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0] + Fz_m += 1. 
* S_ug * Bz_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ] + + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Br_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Bt_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Br_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Bt_m1[ iz_upper, 0] + Fz_m += -1. 
@numba.njit(parallel=True)
def gather_field_prange_cubic(x, y, z,
                    invdz, zmin, Nz,
                    invdr, rmin, Nr,
                    Er_m0, Et_m0, Ez_m0,
                    Er_m1, Et_m1, Ez_m1,
                    Br_m0, Bt_m0, Bz_m0,
                    Br_m1, Bt_m1, Bz_m1,
                    Ex, Ey, Ez,
                    Bx, By, Bz):
    """
    Gathering of the fields (E and B) using numba with multi-threading.
    Iterates over the particles, calculates the weighted amount
    of fields acting on each particle based on its shape (cubic).
    Fields are gathered in cylindrical coordinates and then
    transformed to cartesian coordinates.
    Supports only mode 0 and 1.

    Parameters
    ----------
    x, y, z : 1darray of floats (in meters)
        The position of the particles

    invdz, invdr : float (in meters^-1)
        Inverse of the grid step along the considered direction

    zmin, rmin : float (in meters)
        Position of the edge of the simulation box along the
        direction considered

    Nz, Nr : int
        Number of gridpoints along the considered direction

    Er_m0, Et_m0, Ez_m0 : 2darray of complexs
        The electric fields on the interpolation grid for the mode 0

    Er_m1, Et_m1, Ez_m1 : 2darray of complexs
        The electric fields on the interpolation grid for the mode 1

    Br_m0, Bt_m0, Bz_m0 : 2darray of complexs
        The magnetic fields on the interpolation grid for the mode 0

    Br_m1, Bt_m1, Bz_m1 : 2darray of complexs
        The magnetic fields on the interpolation grid for the mode 1

    Ex, Ey, Ez : 1darray of floats
        The electric fields acting on the particles
        (is modified by this function)

    Bx, By, Bz : 1darray of floats
        The magnetic fields acting on the particles
        (is modified by this function)

    Returns
    -------
    Ex, Ey, Ez, Bx, By, Bz : the same arrays that were passed in,
        after they have been overwritten with the gathered fields
    """
    # Gather the fields for each particle in parallel
    for i in prange(x.shape[0]):
        # Preliminary quantities for the cylindrical conversion
        # --------------------------------------------
        # Position
        xj = x[i]
        yj = y[i]
        zj = z[i]

        # Cylindrical conversion
        rj = math.sqrt(xj**2 + yj**2)
        if (rj != 0.):
            invr = 1./rj
            cos = xj*invr  # Cosine
            sin = yj*invr  # Sine
        else:
            # On the axis the angle is undefined: use theta = 0
            cos = 1.
            sin = 0.
        exptheta_m0 = 1.
        exptheta_m1 = cos - 1.j*sin

        # Get weights for the gathering
        # --------------------------------------------
        # Positions of the particle, in the cell unit
        r_cell = invdr*(rj - rmin) - 0.5
        z_cell = invdz*(zj - zmin) - 0.5

        # Calculate the shape factors
        # (The index lists are initialized with integers and the weight
        # lists with floats, so that numba infers integer indices, which
        # are required to index the grid arrays, and float weights.)
        ir = [0, 0, 0, 0]
        Sr = [0., 0., 0., 0.]
        ir[0] = int64(math.floor(r_cell)) - 1
        ir[1] = ir[0] + 1
        ir[2] = ir[1] + 1
        ir[3] = ir[2] + 1
        Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3
        Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4)
        Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4)
        Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3
        iz = [0, 0, 0, 0]
        Sz = [0., 0., 0., 0.]
        iz[0] = int64(math.floor(z_cell)) - 1
        iz[1] = iz[0] + 1
        iz[2] = iz[1] + 1
        iz[3] = iz[2] + 1
        Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3
        Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4)
        Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4)
        Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3
        # Lower and upper periodic boundary for z
        for index_z in range(4):
            if iz[index_z] < 0:
                iz[index_z] += Nz
            if iz[index_z] > Nz - 1:
                iz[index_z] -= Nz
        # Lower and upper boundary for r
        for index_r in range(4):
            if ir[index_r] < 0:
                # Guard cell below the axis: mirror the index and flip the
                # sign of the weight. The negative sign is used below as a
                # marker for guard-cell contributions.
                ir[index_r] = abs(ir[index_r])-1
                Sr[index_r] = (-1.)*Sr[index_r]
            if ir[index_r] > Nr - 1:
                ir[index_r] = Nr - 1

        # Per-mode sums for the E and B fields
        # ----------------------------
        Er0 = 0.j
        Et0 = 0.j
        Ez0 = 0.j
        Er1 = 0.j
        Et1 = 0.j
        Ez1 = 0.j
        Br0 = 0.j
        Bt0 = 0.j
        Bz0 = 0.j
        Br1 = 0.j
        Bt1 = 0.j
        Bz1 = 0.j
        for index_r in range(4):
            for index_z in range(4):
                jz = iz[index_z]
                jr = ir[index_r]
                # Signed weight (negative for guard cells)
                # and weight with the guard-cell sign removed
                S = Sz[index_z]*Sr[index_r]
                if S < 0:
                    S_abs = (-1.)*S
                else:
                    S_abs = S
                # Mode 0: the guard cells contribute with a factor -1
                # to the r and t components, and +1 to the z component
                Er0 += S*Er_m0[jz, jr]
                Et0 += S*Et_m0[jz, jr]
                Ez0 += S_abs*Ez_m0[jz, jr]
                Br0 += S*Br_m0[jz, jr]
                Bt0 += S*Bt_m0[jz, jr]
                Bz0 += S_abs*Bz_m0[jz, jr]
                # Mode 1: the guard cells contribute with a factor +1
                # to the r and t components, and -1 to the z component
                Er1 += S_abs*Er_m1[jz, jr]
                Et1 += S_abs*Et_m1[jz, jr]
                Ez1 += S*Ez_m1[jz, jr]
                Br1 += S_abs*Br_m1[jz, jr]
                Bt1 += S_abs*Bt_m1[jz, jr]
                Bz1 += S*Bz_m1[jz, jr]

        # E-Field
        # ----------------------------
        # Add the fields from the mode 0 and the mode 1
        Fr = (Er0*exptheta_m0).real + 2*(Er1*exptheta_m1).real
        Ft = (Et0*exptheta_m0).real + 2*(Et1*exptheta_m1).real
        Fz = (Ez0*exptheta_m0).real + 2*(Ez1*exptheta_m1).real
        # Convert to Cartesian coordinates
        # and write to particle field arrays
        Ex[i] = cos*Fr - sin*Ft
        Ey[i] = sin*Fr + cos*Ft
        Ez[i] = Fz

        # B-Field
        # ----------------------------
        # Add the fields from the mode 0 and the mode 1
        Fr = (Br0*exptheta_m0).real + 2*(Br1*exptheta_m1).real
        Ft = (Bt0*exptheta_m0).real + 2*(Bt1*exptheta_m1).real
        Fz = (Bz0*exptheta_m0).real + 2*(Bz1*exptheta_m1).real
        # Convert to Cartesian coordinates
        # and write to particle field arrays
        Bx[i] = cos*Fr - sin*Ft
        By[i] = sin*Fr + cos*Ft
        Bz[i] = Fz

    return Ex, Ey, Ez, Bx, By, Bz
b/fbpic/particles/push/cuda_methods.py @@ -0,0 +1,191 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the particle push methods on the GPU using CUDA. +""" +from numba import cuda +import math +from scipy.constants import c, e + +@cuda.jit('void(float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], \ + float64[:], float64)') +def push_x_gpu( x, y, z, ux, uy, uz, inv_gamma, dt ) : + """ + Advance the particles' positions over one half-timestep + + This assumes that the positions (x, y, z) are initially either + one half-timestep *behind* the momenta (ux, uy, uz), or at the + same timestep as the momenta. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + (is modified by this function) + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + dt : float (seconds) + The time by which the position is advanced + """ + # Half timestep, multiplied by c + chdt = c*0.5*dt + + i = cuda.grid(1) + if i < x.shape[0]: + # Particle push + inv_g = inv_gamma[i] + x[i] += chdt*inv_g*ux[i] + y[i] += chdt*inv_g*uy[i] + z[i] += chdt*inv_g*uz[i] + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], \ + float64, float64, int32, float64)') +def push_p_gpu( ux, uy, uz, inv_gamma, + Ex, Ey, Ez, Bx, By, Bz, + q, m, Ntot, dt ) : + """ + Advance the particles' momenta, using cuda on the GPU + + Parameters + ---------- + ux, uy, uz : 1darray of floats + The velocity of the particles + (is modified by this function) + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + Ex, Ey, Ez : 1darray of floats + The electric fields 
acting on the particles + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + + q : float + The charge of the particle species + + m : float + The mass of the particle species + + Ntot : int + The total number of particles + + dt : float + The time by which the momenta is advanced + """ + # Set a few constants + econst = q*dt/(m*c) + bconst = 0.5*q*dt/m + + #Cuda 1D grid + ip = cuda.grid(1) + + # Loop over the particles + if ip < Ntot: + ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( + ux[ip], uy[ip], uz[ip], inv_gamma[ip], + Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst) + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], \ + float64, int32, float64, int16[:])') +def push_p_ioniz_gpu( ux, uy, uz, inv_gamma, + Ex, Ey, Ez, Bx, By, Bz, + m, Ntot, dt, ionization_level ) : + """ + Advance the particles' momenta, using numba on the GPU + This take into account that the particles are ionizable, and thus + that their charge is determined by `ionization_level` + + Parameters + ---------- + ux, uy, uz : 1darray of floats + The velocity of the particles + (is modified by this function) + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + + m : float + The mass of the particle species + + Ntot : int + The total number of particles + + dt : float + The time by which the momenta is advanced + + ionization_level : 1darray of ints + The number of electrons that each ion is missing + (compared to a neutral atom) + """ + #Cuda 1D grid + ip = cuda.grid(1) + + # Loop over the particles + if ip < Ntot: + if ionization_level[ip] != 0: + # Set a few constants + econst = ionization_level[ip] * e * dt/(m*c) + bconst = 0.5 * ionization_level[ip] * e * dt/m + 
# Use the Vay pusher + ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( + ux[ip], uy[ip], uz[ip], inv_gamma[ip], + Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst) + +@cuda.jit(device=True, inline=True) +def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, + Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): + """ + Push at single macroparticle, using the Vay pusher + """ + # Get the magnetic rotation vector + taux = bconst*Bx + tauy = bconst*By + tauz = bconst*Bz + tau2 = taux**2 + tauy**2 + tauz**2 + + # Get the momenta at the half timestep + uxp = ux_i + econst*Ex \ + + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) + uyp = uy_i + econst*Ey \ + + inv_gamma_i*( uz_i*taux - ux_i*tauz ) + uzp = uz_i + econst*Ez \ + + inv_gamma_i*( ux_i*tauy - uy_i*taux ) + sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 + utau = uxp*taux + uyp*tauy + uzp*tauz + + # Get the new 1./gamma + inv_gamma_f = math.sqrt( + 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) + + # Reuse the tau and utau arrays to save memory + tx = inv_gamma_f*taux + ty = inv_gamma_f*tauy + tz = inv_gamma_f*tauz + ut = inv_gamma_f*utau + s = 1./( 1 + tau2*inv_gamma_f**2 ) + + # Get the new u + ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) + uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) + uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) + + return( ux_f, uy_f, uz_f, inv_gamma_f ) \ No newline at end of file diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py new file mode 100644 index 000000000..6e5807085 --- /dev/null +++ b/fbpic/particles/push/numba_methods.py @@ -0,0 +1,110 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the particle push methods on the CPU with numba. 
+""" +import numba +import math +from scipy.constants import c, e + +@numba.njit +def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): + """ + Advance the particles' positions over one half-timestep + + This assumes that the positions (x, y, z) are initially either + one half-timestep *behind* the momenta (ux, uy, uz), or at the + same timestep as the momenta. + """ + # Half timestep, multiplied by c + chdt = c*0.5*dt + + # Particle push + for ip in range(Ntot) : + x[ip] += chdt * inv_gamma[ip] * ux[ip] + y[ip] += chdt * inv_gamma[ip] * uy[ip] + z[ip] += chdt * inv_gamma[ip] * uz[ip] + +@numba.njit +def push_p_numba( ux, uy, uz, inv_gamma, + Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) : + """ + Advance the particles' momenta, using numba + """ + # Set a few constants + econst = q*dt/(m*c) + bconst = 0.5*q*dt/m + + # Loop over the particles + for ip in range(Ntot) : + ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( + ux[ip], uy[ip], uz[ip], inv_gamma[ip], + Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) + +@numba.njit +def push_p_ioniz_numba( ux, uy, uz, inv_gamma, + Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) : + """ + Advance the particles' momenta, using numba + """ + # Set a few constants + prefactor_econst = e*dt/(m*c) + prefactor_bconst = 0.5*e*dt/m + + # Loop over the particles + for ip in range(Ntot) : + + # For neutral macroparticles, skip this step + if ionization_level[ip] == 0: + continue + + # Calculate the charge dependent constants + econst = prefactor_econst * ionization_level[ip] + bconst = prefactor_bconst * ionization_level[ip] + # Perform the push + ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( + ux[ip], uy[ip], uz[ip], inv_gamma[ip], + Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], + econst, bconst ) + +@numba.njit +def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, + Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): + """ + Push at single macroparticle, using the Vay pusher + """ + # Get the magnetic rotation 
vector + taux = bconst*Bx + tauy = bconst*By + tauz = bconst*Bz + tau2 = taux**2 + tauy**2 + tauz**2 + + # Get the momenta at the half timestep + uxp = ux_i + econst*Ex \ + + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) + uyp = uy_i + econst*Ey \ + + inv_gamma_i*( uz_i*taux - ux_i*tauz ) + uzp = uz_i + econst*Ez \ + + inv_gamma_i*( ux_i*tauy - uy_i*taux ) + sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 + utau = uxp*taux + uyp*tauy + uzp*tauz + + # Get the new 1./gamma + inv_gamma_f = math.sqrt( + 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) + + # Reuse the tau and utau variables to save memory + tx = inv_gamma_f*taux + ty = inv_gamma_f*tauy + tz = inv_gamma_f*tauz + ut = inv_gamma_f*utau + s = 1./( 1 + tau2*inv_gamma_f**2 ) + + # Get the new u + ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) + uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) + uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) + + return( ux_f, uy_f, uz_f, inv_gamma_f ) \ No newline at end of file diff --git a/fbpic/particles/push/threading_methods.py b/fbpic/particles/push/threading_methods.py new file mode 100644 index 000000000..06205dbfc --- /dev/null +++ b/fbpic/particles/push/threading_methods.py @@ -0,0 +1,114 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the particle push methods on the CPU with threading. +""" +import numba +from numba import prange +import math +from scipy.constants import c, e + +@numba.njit(parallel=True) +def push_x_prange( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): + """ + Advance the particles' positions over one half-timestep + + This assumes that the positions (x, y, z) are initially either + one half-timestep *behind* the momenta (ux, uy, uz), or at the + same timestep as the momenta. 
+ """ + # Half timestep, multiplied by c + chdt = c*0.5*dt + + # Particle push in parallel + for ip in prange(Ntot) : + x[ip] += chdt * inv_gamma[ip] * ux[ip] + y[ip] += chdt * inv_gamma[ip] * uy[ip] + z[ip] += chdt * inv_gamma[ip] * uz[ip] + return x, y, z + +@numba.njit(parallel=True) +def push_p_prange( ux, uy, uz, inv_gamma, + Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) : + """ + Advance the particles' momenta, using numba + """ + # Set a few constants + econst = q*dt/(m*c) + bconst = 0.5*q*dt/m + + # Loop over the particles in parallel + for ip in prange(Ntot) : + ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( + ux[ip], uy[ip], uz[ip], inv_gamma[ip], + Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) + return ux, uy, uz, inv_gamma + +@numba.njit(parallel=True) +def push_p_ioniz_prange( ux, uy, uz, inv_gamma, + Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) : + """ + Advance the particles' momenta, using numba + """ + # Set a few constants + prefactor_econst = e*dt/(m*c) + prefactor_bconst = 0.5*e*dt/m + + # Loop over the particles in parallel + for ip in prange(Ntot) : + + # For neutral macroparticles, skip this step + if ionization_level[ip] == 0: + continue + + # Calculate the charge dependent constants + econst = prefactor_econst * ionization_level[ip] + bconst = prefactor_bconst * ionization_level[ip] + # Perform the push + ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( + ux[ip], uy[ip], uz[ip], inv_gamma[ip], + Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], + econst, bconst ) + return ux, uy, uz, inv_gamma + +@numba.njit +def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, + Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): + """ + Push at single macroparticle, using the Vay pusher + """ + # Get the magnetic rotation vector + taux = bconst*Bx + tauy = bconst*By + tauz = bconst*Bz + tau2 = taux**2 + tauy**2 + tauz**2 + + # Get the momenta at the half timestep + uxp = ux_i + econst*Ex \ + + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) + uyp = 
uy_i + econst*Ey \ + + inv_gamma_i*( uz_i*taux - ux_i*tauz ) + uzp = uz_i + econst*Ez \ + + inv_gamma_i*( ux_i*tauy - uy_i*taux ) + sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 + utau = uxp*taux + uyp*tauy + uzp*tauz + + # Get the new 1./gamma + inv_gamma_f = math.sqrt( + 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) + + # Reuse the tau and utau variables to save memory + tx = inv_gamma_f*taux + ty = inv_gamma_f*tauy + tz = inv_gamma_f*tauz + ut = inv_gamma_f*utau + s = 1./( 1 + tau2*inv_gamma_f**2 ) + + # Get the new u + ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) + uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) + uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) + + return( ux_f, uy_f, uz_f, inv_gamma_f ) \ No newline at end of file diff --git a/fbpic/particles/utilities/__init__.py b/fbpic/particles/utilities/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/fbpic/particles/utilities/cuda_sorting.py b/fbpic/particles/utilities/cuda_sorting.py new file mode 100644 index 000000000..cb0a9a09f --- /dev/null +++ b/fbpic/particles/utilities/cuda_sorting.py @@ -0,0 +1,182 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the particle sorting methods on the GPU using CUDA. +""" +from numba import cuda +from accelerate.cuda import sorting + +# ----------------------------------------------------- +# Sorting utilities - get_cell_idx / sort / prefix_sum +# ----------------------------------------------------- + +@cuda.jit('void(int32[:], uint32[:], \ + float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32)') +def get_cell_idx_per_particle(cell_idx, sorted_idx, + x, y, z, + invdz, zmin, Nz, + invdr, rmin, Nr): + """ + Get the cell index of each particle. + The cell index is 1d and calculated by: + cell index in z + cell index in r * number of cells in z. 
+ The cell_idx of a particle is defined by + the lower cell in r and z, that it deposits its field to. + + Parameters + ---------- + cell_idx : 1darray of integers + The cell index of the particle + + sorted_idx : 1darray of integers + The sorted index array needs to be reset + before doing the sort + + x, y, z : 1darray of floats (in meters) + The position of the particles + (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, in each direction + + Nz, Nr : int + Number of gridpoints along the considered direction + """ + i = cuda.grid(1) + if i < cell_idx.shape[0]: + # Preliminary arrays for the cylindrical conversion + xj = x[i] + yj = y[i] + zj = z[i] + rj = math.sqrt( xj**2 + yj**2 ) + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Original index of the uppper and lower cell + ir_lower = int(math.floor( r_cell )) + iz_lower = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_lower < 0: + ir_lower = 0 + # absorbing in upper r + if ir_lower > Nr-1: + ir_lower = Nr-1 + # periodic boundaries in z + if iz_lower < 0: + iz_lower += Nz + if iz_lower > Nz-1: + iz_lower -= Nz + + # Reset sorted_idx array + sorted_idx[i] = i + # Calculate the 1D cell_idx by cell_idx_ir + cell_idx_iz * Nr + cell_idx[i] = ir_lower + iz_lower * Nr + +def sort_particles_per_cell(cell_idx, sorted_idx): + """ + Sort the cell index of the particles and + modify the sorted index array accordingly. + + Parameters + ---------- + cell_idx : 1darray of integers + The cell index of the particle + + sorted_idx : 1darray of integers + Represents the original index of the + particle before the sorting. 
+ """ + Ntot = cell_idx.shape[0] + if Ntot > 0: + sorter = sorting.RadixSort(Ntot, dtype = np.int32) + sorter.sort(cell_idx, vals = sorted_idx) + +@cuda.jit('void(int32[:], int32[:])') +def incl_prefix_sum(cell_idx, prefix_sum): + """ + Perform an inclusive parallel prefix sum on the sorted + cell index array. The prefix sum array represents the + cumulative sum of the number of particles per cell + for each cell index. + + Parameters + ---------- + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # i is the index of the macroparticle + i = cuda.grid(1) + if i < cell_idx.shape[0]-1: + # ci: index of the cell of the present macroparticle + ci = cell_idx[i] + # ci_next: index of the cell of the next macroparticle + ci_next = cell_idx[i+1] + # Fill all the cells between ci and ci_next with the + # inclusive cumulative sum of the number particles until ci + while ci < ci_next: + # The cumulative sum of the number of particle per cell + # until ci is i+1 (since i obeys python index, starting at 0) + prefix_sum[ci] = i+1 + ci += 1 + # The last "macroparticle" of the cell_idx array fills up the + # rest of the prefix sum array + if i == cell_idx.shape[0]-1: + # Get the cell_index of the last macroparticle + ci = cell_idx[i] + # Fill all the remaining entries of the prefix sum array + for empty_index in range(ci, prefix_sum.shape[0]): + prefix_sum[empty_index] = i+1 + +@cuda.jit('void(int32[:])') +def reset_prefix_sum(prefix_sum): + """ + Resets the prefix sum. Sets all the values + to zero. 
+ + Parameters + ---------- + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + i = cuda.grid(1) + if i < prefix_sum.shape[0]: + prefix_sum[i] = 0 + +@cuda.jit('void(uint32[:], float64[:], float64[:])') +def write_sorting_buffer(sorted_idx, val, buf): + """ + Writes the values of a particle array to a buffer, + while rearranging them to match the sorted cell index array. + + Parameters + ---------- + sorted_idx : 1darray of integers + Represents the original index of the + particle before the sorting + + val : 1d array of floats + A particle data array + + buf : 1d array of floats + A buffer array to temporarily store the + sorted particle data array + """ + i = cuda.grid(1) + if i < val.shape[0]: + buf[i] = val[sorted_idx[i]] diff --git a/fbpic/particles/utilities/utility_methods.py b/fbpic/particles/utilities/utility_methods.py new file mode 100644 index 000000000..d79019f0d --- /dev/null +++ b/fbpic/particles/utilities/utility_methods.py @@ -0,0 +1,155 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines particle utility methods. 
+""" +import numpy as np + +# ----------------------- +# Particle shapes utility +# ----------------------- + +def weights(x, invdx, offset, Nx, direction, shape_order): + """ + Return the array of cell indices and corresponding shape factors + for current/charge deposition and field gathering + + Parameters: + ----------- + x : 1darray of floats (in meters) + Array of particle positions along a given direction + (one element per macroparticle) + + invdx : float (in meters^-1) + Inverse of the grid step along the considered direction + + offset : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nx : int + Number of gridpoints along the considered direction + + direction : string + Determines the boundary conditions. Either 'r' or 'z' + + shape_order : int + Order of the shape factor. + Either 1 or 3 + + Returns: + -------- + A tuple containing : + + i: 2darray of ints + An array of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + (i.e. the number of elements in the array x) + This array contains the indices of the grid cells + (along the axis specified by `direction`) where each macroparticle + deposits charge/current and gathers field data. + + S: 2darray of floats + An array of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + (i.e. the number of elements in the array x) + This array contains the shape factors (a.k.a. interpolation weights) + that correspond to each of the indices in the array `i`. 
+ """ + # Positions of the particles, in the cell unit + x_cell = invdx*(x - offset) - 0.5 + + # Initialize empty arrays of the correct size + i = np.empty( (shape_order+1, len(x)), dtype=np.int64) + S = np.empty( (shape_order+1, len(x)), dtype=np.float64) + + # Indices and shapes + if shape_order == 1: + i[0,:] = np.floor(x_cell).astype('int') + i[1,:] = i[0,:] + 1 + # Linear weight + S[0,:] = i[1,:] - x_cell + S[1,:] = x_cell - i[0,:] + elif shape_order == 3: + i[0,:] = np.floor(x_cell).astype('int') - 1 + i[1,:] = i[0,:] + 1 + i[2,:] = i[0,:] + 2 + i[3,:] = i[0,:] + 3 + # Cubic Weights + S[0,:] = -1./6. * ((x_cell-i[0])-2)**3 + S[1,:] = 1./6. * (3*((x_cell-i[1])**3) - 6*((x_cell-i[1])**2)+4) + S[2,:] = 1./6. * (3*((i[2]-x_cell)**3) - 6*((i[2]-x_cell)**2)+4) + S[3,:] = -1./6. * ((i[3]-x_cell)-2)**3 + else: + raise ValueError("shapes other than linear and cubic are not supported yet.") + + # Periodic boundary conditions in z + if direction == 'z': + # Lower Bound Periodic + i = np.where( i < 0, i+Nx, i ) + # Upper Bound Periodic + i = np.where( i > Nx-1, i-Nx, i ) + # Absorbing boundary condition at the upper r boundary + elif direction == 'r': + i = np.where( i > Nx-1, Nx-1, i ) + # Note: The lower bound index shift for r is done in the gather + # and deposit methods because the sign changes. + # This avoids using specific guard cells. + else: + raise ValueError("Unrecognized `direction` : %s" % direction) + + # Return the result + return( i, S ) + +# ---------------------------- +# Angle initialization utility +# ---------------------------- + +def unalign_angles( thetap, Npz, Npr, method='irrational' ) : + """ + Shift the angles so that the particles are + not all aligned along the arms of a star transversely + + The fact that the particles are all aligned can produce + numerical artefacts, especially if the polarization of the laser + is aligned with this direction. 
+ + Here, for each position in r and z, we add the *same* + shift for all the Nptheta particles that are at this position. + (This preserves the fact that certain modes are 0 initially.) + How this shift varies from one position to another depends on + the method chosen. + + Parameters + ---------- + thetap : 3darray of floats + An array of shape (Npr, Npz, Nptheta) containing the angular + positions of the particles, and which is modified by this function. + + Npz, Npr : ints + The number of macroparticles along the z and r directions + + method : string + Either 'random' or 'irrational' + """ + # Determine the angle shift + if method == 'random' : + angle_shift = 2*np.pi*np.random.rand(Npz, Npr) + elif method == 'irrational' : + # Subrandom sequence, by adding irrational number (sqrt(2) and sqrt(3)) + # This ensures that the sequence does not wrap around and induce + # correlations + shiftr = np.sqrt(2)*np.arange(Npr) + shiftz = np.sqrt(3)*np.arange(Npz) + angle_shift = 2*np.pi*( shiftz[:,np.newaxis] + shiftr[np.newaxis,:] ) + angle_shift = np.mod( angle_shift, 2*np.pi ) + else : + raise ValueError( + "method must be either 'random' or 'irrational' but is %s" %method ) + + # Add the angle shift to thetap + # np.newaxis ensures that the angles that are at the same positions + # in r and z have the same shift + thetap[:,:,:] = thetap[:,:,:] + angle_shift[:,:, np.newaxis] From 8f2e3ff8523c132faf522c77d85bbb524ee07339 Mon Sep 17 00:00:00 2001 From: Manuel Kirchen Date: Fri, 14 Jul 2017 19:05:43 +0200 Subject: [PATCH 03/36] Fix pyflakes errors --- fbpic/particles/deposition/numba_methods.py | 1 - fbpic/particles/gathering/cuda_methods.py | 2 - fbpic/particles/particles.py | 41 +++++++++++---------- fbpic/particles/utilities/cuda_sorting.py | 2 + 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/fbpic/particles/deposition/numba_methods.py b/fbpic/particles/deposition/numba_methods.py index 4ad61f9f9..54fedeb39 100644 --- 
a/fbpic/particles/deposition/numba_methods.py +++ b/fbpic/particles/deposition/numba_methods.py @@ -7,7 +7,6 @@ order shapes on the CPU with numba. """ import numba -from scipy.constants import c, e @numba.njit def deposit_field_numba(Fptcl, Fgrid, diff --git a/fbpic/particles/gathering/cuda_methods.py b/fbpic/particles/gathering/cuda_methods.py index 6bd5a6c0c..392d3ef52 100644 --- a/fbpic/particles/gathering/cuda_methods.py +++ b/fbpic/particles/gathering/cuda_methods.py @@ -8,8 +8,6 @@ """ from numba import cuda, float64, int64 import math -from scipy.constants import c, e -import numpy as np # ----------------------- # Field gathering linear diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index 0094e305e..73197b2b5 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -10,7 +10,6 @@ from .ionization import Ionizer from .tracking import ParticleTracker import numba -import math # Load the utility methods from .utilities.utility_methods import weights, unalign_angles @@ -786,13 +785,14 @@ def deposit( self, fld, fieldtype ) : grid[0].rho, grid[1].rho, self.nthreads, tx_chunks, tx_N ) elif self.particle_shape == 'cubic': - deposit_rho_prange_cubic( - self.x, self.y, self.z, self.w, - grid[0].invdz, grid[0].zmin, grid[0].Nz, - grid[0].invdr, grid[0].rmin, grid[0].Nr, - rho_m0_global, rho_m1_global, - grid[0].rho, grid[1].rho, - self.nthreads, tx_chunks, tx_N ) + print('Not yet implemented') + # deposit_rho_prange_cubic( + # self.x, self.y, self.z, self.w, + # grid[0].invdz, grid[0].zmin, grid[0].Nz, + # grid[0].invdr, grid[0].rmin, grid[0].Nr, + # rho_m0_global, rho_m1_global, + # grid[0].rho, grid[1].rho, + # self.nthreads, tx_chunks, tx_N ) else: raise ValueError("`particle_shape` should be either \ 'linear' or 'cubic' \ @@ -836,18 +836,19 @@ def deposit( self, fld, fieldtype ) : grid[0].Jz, grid[1].Jz, self.nthreads, tx_chunks, tx_N ) elif self.particle_shape == 'cubic': - deposit_J_prange_cubic( - self.x, 
self.y, self.z, self.w, - self.ux, self.uy, self.uz, self.inv_gamma, - grid[0].invdz, grid[0].zmin, grid[0].Nz, - grid[0].invdr, grid[0].rmin, grid[0].Nr, - Jr_m0_global, Jr_m1_global, - Jt_m0_global, Jt_m1_global, - Jz_m0_global, Jz_m1_global, - grid[0].Jr, grid[1].Jr, - grid[0].Jt, grid[1].Jt, - grid[0].Jz, grid[1].Jz, - self.nthreads, tx_chunks, tx_N ) + print('Not yet implemented') + # deposit_J_prange_cubic( + # self.x, self.y, self.z, self.w, + # self.ux, self.uy, self.uz, self.inv_gamma, + # grid[0].invdz, grid[0].zmin, grid[0].Nz, + # grid[0].invdr, grid[0].rmin, grid[0].Nr, + # Jr_m0_global, Jr_m1_global, + # Jt_m0_global, Jt_m1_global, + # Jz_m0_global, Jz_m1_global, + # grid[0].Jr, grid[1].Jr, + # grid[0].Jt, grid[1].Jt, + # grid[0].Jz, grid[1].Jz, + # self.nthreads, tx_chunks, tx_N ) else: raise ValueError("`particle_shape` should be either \ 'linear' or 'cubic' \ diff --git a/fbpic/particles/utilities/cuda_sorting.py b/fbpic/particles/utilities/cuda_sorting.py index cb0a9a09f..45a1ddb9f 100644 --- a/fbpic/particles/utilities/cuda_sorting.py +++ b/fbpic/particles/utilities/cuda_sorting.py @@ -7,6 +7,8 @@ """ from numba import cuda from accelerate.cuda import sorting +import math +import numpy as np # ----------------------------------------------------- # Sorting utilities - get_cell_idx / sort / prefix_sum From 380b326780bc20761003d9a675fe6d8262df99a2 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Fri, 14 Jul 2017 21:54:03 -0700 Subject: [PATCH 04/36] Print number of threads along with number of MPI procs --- fbpic/main.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fbpic/main.py b/fbpic/main.py index f33554966..e3ce1e975 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -228,7 +228,7 @@ def dens_func( z, r ) ... 
self.comm = BoundaryCommunicator( Nz, zmin, zmax, Nr, rmax, Nm, dt, boundaries, n_order, n_guard, n_damp, exchange_period, use_all_mpi_ranks ) - print_simulation_setup( self.comm, self.use_cuda ) + print_simulation_setup( self.comm, self.use_cuda, self.use_threading ) # Modify domain region zmin, zmax, p_zmin, p_zmax, Nz = \ self.comm.divide_into_domain(zmin, zmax, p_zmin, p_zmax) @@ -595,7 +595,7 @@ def progression_bar( i, Ntot, measured_start, Nbars=50, char='-'): sys.stdout.write(', %d:%02d:%02d left' % (h, m, s)) sys.stdout.flush() -def print_simulation_setup( comm, use_cuda ): +def print_simulation_setup( comm, use_cuda, use_threading ): """ Print message about the number of proc and whether it is using GPU or CPU. @@ -607,13 +607,20 @@ def print_simulation_setup( comm, use_cuda ): use_cuda: bool Whether the simulation is set up to use CUDA + + use_threading: bool + Whether the simulation is set up to use threads on CPU """ if comm.rank == 0: if use_cuda: message = "\nRunning FBPIC on GPU " else: message = "\nRunning FBPIC on CPU " - message += "with %d proc.\n" %comm.size + message += "with %d proc" %comm.size + if use_threading and not use_cuda: + message += " (%d threads per proc)" %numba.config.NUMBA_NUM_THREADS + message += ".\n" + print( message ) def adapt_to_grid( x, p_xmin, p_xmax, p_nx, ncells_empty=0 ): From af15196a8758221aaeef9638beb508922bc565f8 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Sat, 15 Jul 2017 18:29:23 -0700 Subject: [PATCH 05/36] Swapped the order of global arrays + removed thread-local arrays --- .../particles/deposition/threading_methods.py | 90 +++++++------------ fbpic/particles/particles.py | 36 ++++---- 2 files changed, 49 insertions(+), 77 deletions(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index 595d6bdb5..7f023f5b0 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -79,7 +79,6 @@ def 
deposit_rho_prange_linear(x, y, z, w, invdz, zmin, Nz, invdr, rmin, Nr, rho_m0_global, rho_m1_global, - rho_m0, rho_m1, nthreads, tx_chunks, tx_N): """ Deposition of the charge density rho using numba prange on the CPU. @@ -120,8 +119,6 @@ def deposit_rho_prange_linear(x, y, z, w, # Create thread_local helper arrays # FIXME! ( instead of using zeros_like, # it would be nicer to use np.zeros((Nz,Nr)) ) - rho_m0_thread = np.zeros_like( rho_m0 ) - rho_m1_thread = np.zeros_like( rho_m1 ) # Loop over all particles in thread chunk for idx in range( tx_chunks[tx] ): # Calculate thread local particle index @@ -203,21 +200,17 @@ def deposit_rho_prange_linear(x, y, z, w, if iz_cell+1 > Nz-1: shift_z -= Nz # Write to thread local arrays - rho_m0_thread[iz_cell, ir_cell] += R_m0_00 - rho_m1_thread[iz_cell, ir_cell] += R_m1_00 + rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00 + rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00 - rho_m0_thread[iz_cell+1 + shift_z, ir_cell] += R_m0_01 - rho_m1_thread[iz_cell+1 + shift_z, ir_cell] += R_m1_01 + rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m0_01 + rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m1_01 - rho_m0_thread[iz_cell, ir_cell+1 + shift_r] += R_m0_10 - rho_m1_thread[iz_cell, ir_cell+1 + shift_r] += R_m1_10 + rho_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m0_10 + rho_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m1_10 - rho_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11 - rho_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11 - - # Write thread local deposition arrays to global deposition arrays - rho_m0_global[:,:,tx] = rho_m0_thread - rho_m1_global[:,:,tx] = rho_m1_thread + rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11 + rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11 return @@ -233,9 +226,6 @@ def deposit_J_prange_linear(x, y, z, w, j_r_m0_global, j_r_m1_global, j_t_m0_global, j_t_m1_global, j_z_m0_global, j_z_m1_global, - j_r_m0, j_r_m1, 
- j_t_m0, j_t_m1, - j_z_m0, j_z_m1, nthreads, tx_chunks, tx_N): """ Deposition of the current density J using numba prange on the CPU. @@ -283,12 +273,6 @@ def deposit_J_prange_linear(x, y, z, w, # Create thread_local helper arrays # FIXME! ( instead of using zeros_like, # it would be nicer to use np.zeros((Nz,Nr)) ) - j_r_m0_thread = np.zeros_like( j_r_m0 ) - j_t_m0_thread = np.zeros_like( j_t_m0 ) - j_z_m0_thread = np.zeros_like( j_z_m0 ) - j_r_m1_thread = np.zeros_like( j_r_m1 ) - j_t_m1_thread = np.zeros_like( j_t_m1 ) - j_z_m1_thread = np.zeros_like( j_z_m1 ) # Loop over all particles in thread chunk for idx in range( tx_chunks[tx] ): # Calculate thread local particle index @@ -407,48 +391,40 @@ def deposit_J_prange_linear(x, y, z, w, if (iz_cell+1) > Nz-1: shift_z -= Nz - j_r_m0_thread[iz_cell, ir_cell] += J_r_m0_00 - j_r_m1_thread[iz_cell, ir_cell] += J_r_m1_00 - - j_r_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_r_m0_01 - j_r_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_r_m1_01 + j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00 + j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00 - j_r_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_r_m0_10 - j_r_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_r_m1_10 + j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01 + j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01 - j_r_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11 - j_r_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11 + j_r_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10 + j_r_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10 - j_t_m0_thread[iz_cell, ir_cell] += J_t_m0_00 - j_t_m1_thread[iz_cell, ir_cell] += J_t_m1_00 + j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11 + j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11 - j_t_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_t_m0_01 - j_t_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_t_m1_01 + j_t_m0_global[tx,iz_cell, 
ir_cell] += J_t_m0_00 + j_t_m1_global[tx,iz_cell, ir_cell] += J_t_m1_00 - j_t_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_t_m0_10 - j_t_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_t_m1_10 + j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01 + j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01 - j_t_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11 - j_t_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11 + j_t_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10 + j_t_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10 - j_z_m0_thread[iz_cell, ir_cell] += J_z_m0_00 - j_z_m1_thread[iz_cell, ir_cell] += J_z_m1_00 + j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11 + j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11 - j_z_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_z_m0_01 - j_z_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_z_m1_01 + j_z_m0_global[tx,iz_cell, ir_cell] += J_z_m0_00 + j_z_m1_global[tx,iz_cell, ir_cell] += J_z_m1_00 - j_z_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_z_m0_10 - j_z_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_z_m1_10 + j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01 + j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01 - j_z_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11 - j_z_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11 + j_z_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10 + j_z_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10 - # Write thread local deposition arrays to global deposition arrays - j_r_m0_global[:,:,tx] = j_r_m0_thread - j_t_m0_global[:,:,tx] = j_t_m0_thread - j_z_m0_global[:,:,tx] = j_z_m0_thread - j_r_m1_global[:,:,tx] = j_r_m1_thread - j_t_m1_global[:,:,tx] = j_t_m1_thread - j_z_m1_global[:,:,tx] = j_z_m1_thread + j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11 + j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += 
J_z_m1_11 - return \ No newline at end of file + return diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index 73197b2b5..fa4497cc7 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -770,10 +770,10 @@ def deposit( self, fld, fieldtype ) : if fieldtype == 'rho': # Generate temporary arrays for rho rho_m0_global = np.zeros( - (grid[0].rho.shape[0], grid[0].rho.shape[1], self.nthreads), + (self.nthreads, grid[0].rho.shape[0], grid[0].rho.shape[1]), dtype=grid[0].rho.dtype ) rho_m1_global = np.zeros( - (grid[1].rho.shape[0], grid[1].rho.shape[1], self.nthreads), + (self.nthreads, grid[1].rho.shape[0], grid[1].rho.shape[1]), dtype=grid[1].rho.dtype ) # Deposit rho using CPU threading if self.particle_shape == 'linear': @@ -782,7 +782,6 @@ def deposit( self, fld, fieldtype ) : grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, rho_m0_global, rho_m1_global, - grid[0].rho, grid[1].rho, self.nthreads, tx_chunks, tx_N ) elif self.particle_shape == 'cubic': print('Not yet implemented') @@ -798,28 +797,28 @@ def deposit( self, fld, fieldtype ) : 'linear' or 'cubic' \ but is `%s`" % self.particle_shape) # Sum thread-local results to main field array - grid[0].rho = np.sum(rho_m0_global, axis=2) - grid[1].rho = np.sum(rho_m1_global, axis=2) + grid[0].rho = np.sum(rho_m0_global, axis=0) + grid[1].rho = np.sum(rho_m1_global, axis=0) elif fieldtype == 'J': # Generate temporary arrays for J Jr_m0_global = np.zeros( - (grid[0].Jr.shape[0], grid[0].Jr.shape[1], self.nthreads), + (self.nthreads, grid[0].Jr.shape[0], grid[0].Jr.shape[1]), dtype=grid[0].Jr.dtype ) Jt_m0_global = np.zeros( - (grid[0].Jt.shape[0], grid[0].Jt.shape[1], self.nthreads), + (self.nthreads, grid[0].Jt.shape[0], grid[0].Jt.shape[1]), dtype=grid[0].Jt.dtype ) Jz_m0_global = np.zeros( - (grid[0].Jz.shape[0], grid[0].Jz.shape[1], self.nthreads), + (self.nthreads, grid[0].Jz.shape[0], grid[0].Jz.shape[1]), dtype=grid[0].Jz.dtype ) 
Jr_m1_global = np.zeros( - (grid[1].Jr.shape[0], grid[1].Jr.shape[1], self.nthreads), + (self.nthreads, grid[1].Jr.shape[0], grid[1].Jr.shape[1]), dtype=grid[1].Jr.dtype ) Jt_m1_global = np.zeros( - (grid[1].Jt.shape[0], grid[1].Jt.shape[1], self.nthreads), + (self.nthreads, grid[1].Jt.shape[0], grid[1].Jt.shape[1]), dtype=grid[1].Jt.dtype ) Jz_m1_global = np.zeros( - (grid[1].Jz.shape[0], grid[1].Jz.shape[1], self.nthreads), + (self.nthreads, grid[1].Jz.shape[0], grid[1].Jz.shape[1]), dtype=grid[1].Jz.dtype ) # Deposit J using CPU threading if self.particle_shape == 'linear': @@ -831,9 +830,6 @@ def deposit( self, fld, fieldtype ) : Jr_m0_global, Jr_m1_global, Jt_m0_global, Jt_m1_global, Jz_m0_global, Jz_m1_global, - grid[0].Jr, grid[1].Jr, - grid[0].Jt, grid[1].Jt, - grid[0].Jz, grid[1].Jz, self.nthreads, tx_chunks, tx_N ) elif self.particle_shape == 'cubic': print('Not yet implemented') @@ -854,12 +850,12 @@ def deposit( self, fld, fieldtype ) : 'linear' or 'cubic' \ but is `%s`" % self.particle_shape) # Sum thread-local results to main field array - grid[0].Jr = np.sum(Jr_m0_global, axis=2) - grid[0].Jt = np.sum(Jt_m0_global, axis=2) - grid[0].Jz = np.sum(Jz_m0_global, axis=2) - grid[1].Jr = np.sum(Jr_m1_global, axis=2) - grid[1].Jt = np.sum(Jt_m1_global, axis=2) - grid[1].Jz = np.sum(Jz_m1_global, axis=2) + grid[0].Jr = np.sum(Jr_m0_global, axis=0) + grid[0].Jt = np.sum(Jt_m0_global, axis=0) + grid[0].Jz = np.sum(Jz_m0_global, axis=0) + grid[1].Jr = np.sum(Jr_m1_global, axis=0) + grid[1].Jt = np.sum(Jt_m1_global, axis=0) + grid[1].Jz = np.sum(Jz_m1_global, axis=0) else: raise ValueError("`fieldtype` should be either 'J' or \ From 1e252be4d50daf7aceaf6b962c6639400b6fe02f Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Sat, 15 Jul 2017 18:49:48 -0700 Subject: [PATCH 06/36] Fix pyflakes errors --- fbpic/particles/deposition/threading_methods.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fbpic/particles/deposition/threading_methods.py 
b/fbpic/particles/deposition/threading_methods.py index 7f023f5b0..04b46276a 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -10,7 +10,6 @@ from numba import prange, int64 import math from scipy.constants import c -import numpy as np # ------------------------------- # Particle shape Factor functions From f9ce67927d3ce134ccb7a2cc6b6e254c482b15f3 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Sun, 16 Jul 2017 12:54:16 -0700 Subject: [PATCH 07/36] Corrected import pattern in laser antenna --- fbpic/lpa_utils/laser/antenna.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fbpic/lpa_utils/laser/antenna.py b/fbpic/lpa_utils/laser/antenna.py index 3f3e8425c..d1aaf10b0 100644 --- a/fbpic/lpa_utils/laser/antenna.py +++ b/fbpic/lpa_utils/laser/antenna.py @@ -10,8 +10,8 @@ from scipy.constants import e, c, epsilon_0, physical_constants r_e = physical_constants['classical electron radius'][0] from .profiles import gaussian_profile -from fbpic.particles.utility_methods import weights -from fbpic.particles.numba_methods import deposit_field_numba +from fbpic.particles.utilities.utility_methods import weights +from fbpic.particles.deposition.numba_methods import deposit_field_numba # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed From eedf0190df8aca9f2fdedb087a57cea1dc4014ce Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Sun, 16 Jul 2017 13:12:51 -0700 Subject: [PATCH 08/36] Fix thread index calculation --- fbpic/particles/particles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index fa4497cc7..18422189f 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -764,7 +764,7 @@ def deposit( self, fld, fieldtype ) : # Register particle chunk size for each thread tx_N = int(self.Ntot/self.nthreads) tx_chunks = [ tx_N for k in 
range(self.nthreads) ] - tx_chunks[-1] = tx_chunks[-1] + (tx_N)%(self.nthreads) + tx_chunks[-1] = tx_chunks[-1] + self.Ntot%(self.nthreads) # Multithreading functions for the deposition of rho or J # for Mode 0 and 1 only. if fieldtype == 'rho': From 3a5f0f08f5c51c77fdb9c866db4a04ef05567626 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Sun, 16 Jul 2017 21:31:09 -0700 Subject: [PATCH 09/36] Fix automated tests --- fbpic/particles/particles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index 18422189f..818e38632 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -764,7 +764,7 @@ def deposit( self, fld, fieldtype ) : # Register particle chunk size for each thread tx_N = int(self.Ntot/self.nthreads) tx_chunks = [ tx_N for k in range(self.nthreads) ] - tx_chunks[-1] = tx_chunks[-1] + self.Ntot%(self.nthreads) + tx_chunks[-1] = tx_chunks[-1] + int(self.Ntot%self.nthreads) # Multithreading functions for the deposition of rho or J # for Mode 0 and 1 only. 
if fieldtype == 'rho': From a3c22482378690129efe9e514028e04feb78b29e Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Sun, 16 Jul 2017 21:29:31 -0700 Subject: [PATCH 10/36] Implement parallel reduce --- .../particles/deposition/threading_methods.py | 27 ++++++++++++++++--- fbpic/particles/particles.py | 18 ++++++------- 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index 04b46276a..02c8d7d9f 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -12,7 +12,7 @@ from scipy.constants import c # ------------------------------- -# Particle shape Factor functions +# Particle shape Factor functions # ------------------------------- # Linear shapes @@ -116,7 +116,7 @@ def deposit_rho_prange_linear(x, y, z, w, # Deposit the field per cell in parallel (for threads < number of cells) for tx in prange( nthreads ): # Create thread_local helper arrays - # FIXME! ( instead of using zeros_like, + # FIXME! ( instead of using zeros_like, # it would be nicer to use np.zeros((Nz,Nr)) ) # Loop over all particles in thread chunk for idx in range( tx_chunks[tx] ): @@ -270,7 +270,7 @@ def deposit_J_prange_linear(x, y, z, w, # Deposit the field per cell in parallel (for threads < number of cells) for tx in prange( nthreads ): # Create thread_local helper arrays - # FIXME! ( instead of using zeros_like, + # FIXME! 
( instead of using zeros_like, # it would be nicer to use np.zeros((Nz,Nr)) ) # Loop over all particles in thread chunk for idx in range( tx_chunks[tx] ): @@ -427,3 +427,24 @@ def deposit_J_prange_linear(x, y, z, w, j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11 return + +# ----------------------------------------------------------------------- +# Parallel reduction of the global arrays for threads into a single array +# ----------------------------------------------------------------------- + +@numba.njit( parallel=True ) +def sum_reduce_2d_array( global_array, reduced_array ): + """ + # TODO + """ + # Extract size of each dimension + Nreduce, Nz, Nr = global_array.shape + + # Parallel loop over iz + for iz in prange( Nz ): + # Loop over the reduction dimension (slow dimension) + for it in range( Nreduce ): + # Loop over ir (fast dimension) + for ir in range( Nr ): + + reduced_array[ iz, ir ] += global_array[ it, iz, ir ] diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index 818e38632..d1893d38f 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -21,7 +21,7 @@ from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \ push_x_prange from .deposition.threading_methods import deposit_rho_prange_linear, \ - deposit_J_prange_linear #CUBIC tbd + deposit_J_prange_linear, sum_reduce_2d_array #CUBIC tbd from .gathering.threading_methods import gather_field_prange_linear, \ gather_field_prange_cubic @@ -797,8 +797,8 @@ def deposit( self, fld, fieldtype ) : 'linear' or 'cubic' \ but is `%s`" % self.particle_shape) # Sum thread-local results to main field array - grid[0].rho = np.sum(rho_m0_global, axis=0) - grid[1].rho = np.sum(rho_m1_global, axis=0) + sum_reduce_2d_array( rho_m0_global, grid[0].rho ) + sum_reduce_2d_array( rho_m1_global, grid[1].rho ) elif fieldtype == 'J': # Generate temporary arrays for J @@ -850,12 +850,12 @@ def deposit( self, fld, fieldtype ) : 'linear' or 
'cubic' \ but is `%s`" % self.particle_shape) # Sum thread-local results to main field array - grid[0].Jr = np.sum(Jr_m0_global, axis=0) - grid[0].Jt = np.sum(Jt_m0_global, axis=0) - grid[0].Jz = np.sum(Jz_m0_global, axis=0) - grid[1].Jr = np.sum(Jr_m1_global, axis=0) - grid[1].Jt = np.sum(Jt_m1_global, axis=0) - grid[1].Jz = np.sum(Jz_m1_global, axis=0) + sum_reduce_2d_array( Jr_m0_global, grid[0].Jr ) + sum_reduce_2d_array( Jt_m0_global, grid[0].Jt ) + sum_reduce_2d_array( Jz_m0_global, grid[0].Jz ) + sum_reduce_2d_array( Jr_m1_global, grid[1].Jr ) + sum_reduce_2d_array( Jt_m1_global, grid[1].Jt ) + sum_reduce_2d_array( Jz_m1_global, grid[1].Jz ) else: raise ValueError("`fieldtype` should be either 'J' or \ From da28bbe3d70489e6cb42c337686fe71f68c08877 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Sun, 16 Jul 2017 23:42:45 -0700 Subject: [PATCH 11/36] Added docstring to the function --- fbpic/particles/deposition/threading_methods.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index 02c8d7d9f..cb4a3e84a 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -435,7 +435,17 @@ def deposit_J_prange_linear(x, y, z, w, @numba.njit( parallel=True ) def sum_reduce_2d_array( global_array, reduced_array ): """ - # TODO + Sum the array `global_array` along its first axis and + add it into `reduced_array`. 
+ + Parameters: + ----------- + global_array: 3darray of complexs + Field array whose first dimension corresponds to the + reduction dimension (typically: the number of threads used + during the current deposition) + + reduced array: 2darray of complexs """ # Extract size of each dimension Nreduce, Nz, Nr = global_array.shape From f70cca196102e155bf49f71b78dd0a98342ab2a6 Mon Sep 17 00:00:00 2001 From: Manuel Kirchen Date: Mon, 17 Jul 2017 17:11:23 +0200 Subject: [PATCH 12/36] Added cubic deposition functions --- .../particles/deposition/threading_methods.py | 1200 ++++++++++++++++- 1 file changed, 1149 insertions(+), 51 deletions(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index cb4a3e84a..3cab552be 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -12,7 +12,7 @@ from scipy.constants import c # ------------------------------- -# Particle shape Factor functions +# Particle shape Factor functions # ------------------------------- # Linear shapes @@ -78,13 +78,14 @@ def deposit_rho_prange_linear(x, y, z, w, invdz, zmin, Nz, invdr, rmin, Nr, rho_m0_global, rho_m1_global, - nthreads, tx_chunks, tx_N): + nthreads, tx_chunks): """ Deposition of the charge density rho using numba prange on the CPU. Iterates over the threads in parallel, while each thread iterates over a batch of particles. Intermediate results for each threads are stored in copies of the global grid. At the end of the parallel loop, - the thread-local field arrays are combined (summed) to the global array. + the thread-local field arrays are combined (summed) to a global array. + (This final reduction is *not* done in this function) Calculates the weighted amount of rho that is deposited to the 4 cells surounding the particle based on its shape (linear). 
@@ -99,9 +100,10 @@ def deposit_rho_prange_linear(x, y, z, w, w : 1d array of floats The weights of the particles - rho_m0, rho_m1 : 2darrays of complexs - The charge density on the interpolation grid for - mode 0 and 1. (is modified by this function) + rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local charge densities + on the interpolation grid for mode 0 and 1. + (is modified by this function) invdz, invdr : float (in meters^-1) Inverse of the grid step along the considered direction @@ -112,16 +114,20 @@ def deposit_rho_prange_linear(x, y, z, w, Nz, Nr : int Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + tx_chunks : list of int + Exact particle batch size per thread. + Last batch size can be greater if Ntot is not a multiple of nthreads. """ # Deposit the field per cell in parallel (for threads < number of cells) for tx in prange( nthreads ): - # Create thread_local helper arrays - # FIXME! ( instead of using zeros_like, - # it would be nicer to use np.zeros((Nz,Nr)) ) # Loop over all particles in thread chunk for idx in range( tx_chunks[tx] ): # Calculate thread local particle index - ptcl_idx = idx + tx*tx_N + ptcl_idx = idx + tx*tx_chunks[0] # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position @@ -175,10 +181,21 @@ def deposit_rho_prange_linear(x, y, z, w, # Boundary Region Shifts ir_flip = int( math.floor(r_cell) ) - R_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal + # Declare local field array + R_m0_00 = 0. + R_m0_01 = 0. + R_m0_10 = 0. + R_m0_11 = 0. + + R_m1_00 = 0. + 0.j + R_m1_01 = 0. 
+ 0.j + R_m1_10 = 0. + 0.j + R_m1_11 = 0. + 0.j + + R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal if ir_flip == -1: R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal @@ -186,10 +203,10 @@ def deposit_rho_prange_linear(x, y, z, w, R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal else: - R_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal # Cell shifts for the simulation boundaries shift_r = 0 @@ -198,7 +215,8 @@ def deposit_rho_prange_linear(x, y, z, w, shift_r = -1 if iz_cell+1 > Nz-1: shift_z -= Nz - # Write to thread local arrays + + # Write ptcl fields to thread-local part of global deposition array rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00 rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00 @@ -225,13 +243,14 @@ def deposit_J_prange_linear(x, y, z, w, j_r_m0_global, j_r_m1_global, j_t_m0_global, j_t_m1_global, j_z_m0_global, j_z_m1_global, - nthreads, tx_chunks, tx_N): + nthreads, tx_chunks): """ Deposition of the current density J using numba prange on the CPU. 
Iterates over the threads in parallel, while each thread iterates over a batch of particles. Intermediate results for each threads are stored in copies of the global grid. At the end of the parallel loop, the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) Calculates the weighted amount of J that is deposited to the 4 cells surounding the particle based on its shape (linear). @@ -252,9 +271,9 @@ def deposit_J_prange_linear(x, y, z, w, inv_gamma : 1darray of floats The inverse of the relativistic gamma factor - j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs - The current component in each direction (r, t, z) - on the interpolation grid for mode 0 and 1. + j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local current component + in each direction (r, t, z) on the interpolation grid for mode 0 and 1. (is modified by this function) invdz, invdr : float (in meters^-1) @@ -266,16 +285,20 @@ def deposit_J_prange_linear(x, y, z, w, Nz, Nr : int Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + tx_chunks : list of int + Exact particle batch size per thread. + Last batch size can be greater if Ntot is not a multiple of nthreads. """ # Deposit the field per cell in parallel (for threads < number of cells) for tx in prange( nthreads ): - # Create thread_local helper arrays - # FIXME! 
( instead of using zeros_like, - # it would be nicer to use np.zeros((Nz,Nr)) ) # Loop over all particles in thread chunk for idx in range( tx_chunks[tx] ): # Calculate thread local particle index - ptcl_idx = idx + tx*tx_N + ptcl_idx = idx + tx*tx_chunks[0] # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position @@ -341,18 +364,47 @@ def deposit_J_prange_linear(x, y, z, w, # Boundary Region Shifts ir_flip = int( math.floor(r_cell) ) - J_r_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal + # Declare local field arrays + J_r_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m0_00 = 0. + J_t_m1_00 = 0. + 0.j + J_z_m0_00 = 0. + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m0_01 = 0. + J_t_m1_01 = 0. + 0.j + J_z_m0_01 = 0. + J_z_m1_01 = 0. + 0.j + + J_r_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m0_10 = 0. + J_t_m1_10 = 0. + 0.j + J_z_m0_10 = 0. + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m0_11 = 0. + J_t_m1_11 = 0. + 0.j + J_z_m0_11 = 0. + J_z_m1_11 = 0. 
+ 0.j + + J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal # Take into account lower r flips if ir_flip == -1: @@ -369,18 +421,18 @@ def deposit_J_prange_linear(x, y, z, w, J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal else: - J_r_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_11 = 
r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal # Cell shifts for the simulation boundaries shift_r = 0 @@ -390,6 +442,7 @@ def deposit_J_prange_linear(x, y, z, w, if (iz_cell+1) > Nz-1: shift_z -= Nz + # Write ptcl fields to thread-local part of global deposition array j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00 j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00 @@ -428,6 +481,1051 @@ def deposit_J_prange_linear(x, y, z, w, return + +# ------------------------------- +# Field deposition - cubic - rho +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_rho_prange_cubic(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0_global, rho_m1_global, + nthreads, tx_chunks): + """ + + Deposition of the charge density rho using numba prange on the CPU. 
+ Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each threads are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of rho that is deposited to the + 16 cells surounding the particle based on its shape (cubic). + + The rest of the execution is similar to the CUDA equivalent function. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local charge densities + on the interpolation grid for mode 0 and 1. + (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + tx_chunks : list of int + Exact particle batch size per thread. + Last batch size can be greater if Ntot is not a multiple of nthreads. + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for tx in prange( nthreads ): + # Loop over all particles in thread chunk + for idx in range( tx_chunks[tx] ): + # Calculate thread local particle index + ptcl_idx = idx + tx*tx_chunks[0] + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. 
+ if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + + # Original index of the uppper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Compute values in local copies and consider boundaries + ir_flip = int( math.floor(r_cell) ) - 1 + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode. + R_m0_00 = 0. + R_m1_00 = 0. + 0.j + + R_m0_01 = 0. + R_m1_01 = 0. + 0.j + + R_m0_02 = 0. + R_m1_02 = 0. + 0.j + + R_m0_03 = 0. + R_m1_03 = 0. + 0.j + + R_m0_10 = 0. + R_m1_10 = 0. + 0.j + + R_m0_11 = 0. + R_m1_11 = 0. + 0.j + + R_m0_12 = 0. + R_m1_12 = 0. + 0.j + + R_m0_13 = 0. + R_m1_13 = 0. + 0.j + + R_m0_20 = 0. + R_m1_20 = 0. + 0.j + + R_m0_21 = 0. + R_m1_21 = 0. + 0.j + + R_m0_22 = 0. + R_m1_22 = 0. + 0.j + + R_m0_23 = 0. + R_m1_23 = 0. + 0.j + + R_m0_30 = 0. + R_m1_30 = 0. + 0.j + + R_m0_31 = 0. + R_m1_31 = 0. + 0.j + + R_m0_32 = 0. + R_m1_32 = 0. + 0.j + + R_m0_33 = 0. + R_m1_33 = 0. 
+ 0.j + + if (ir_flip == -2): + R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 
1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + if (ir_flip == -1): + R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + if (ir_flip >= 0): + R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Write ptcl fields to thread-local part of global deposition array + rho_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00 + rho_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00 + + rho_m0_global[iz_cell, ir_cell - 1 + srl] += R_m0_01 + rho_m1_global[iz_cell, ir_cell - 1 + srl] += R_m1_01 + + rho_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] 
+= R_m0_02 + rho_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02 + + rho_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03 + rho_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03 + + rho_m0_global[iz_cell - 1 + szl, ir_cell] += R_m0_10 + rho_m1_global[iz_cell - 1 + szl, ir_cell] += R_m1_10 + + rho_m0_global[iz_cell, ir_cell] += R_m0_11 + rho_m1_global[iz_cell, ir_cell] += R_m1_11 + + rho_m0_global[iz_cell + 1 + szu, ir_cell] += R_m0_12 + rho_m1_global[iz_cell + 1 + szu, ir_cell] += R_m1_12 + + rho_m0_global[iz_cell + 2 + szu2, ir_cell] += R_m0_13 + rho_m1_global[iz_cell + 2 + szu2, ir_cell] += R_m1_13 + + rho_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20 + rho_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20 + + rho_m0_global[iz_cell, ir_cell + 1 + sru] += R_m0_21 + rho_m1_global[iz_cell, ir_cell + 1 + sru] += R_m1_21 + + rho_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22 + rho_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22 + + rho_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23 + rho_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23 + + rho_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30 + rho_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30 + + rho_m0_global[iz_cell, ir_cell + 2 + sru2] += R_m0_31 + rho_m1_global[iz_cell, ir_cell + 2 + sru2] += R_m1_31 + + rho_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32 + rho_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32 + + rho_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33 + rho_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33 + + return + +# ------------------------------- +# Field deposition - cubic - J +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_J_prange_cubic(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0_global, j_r_m1_global, + j_t_m0_global, j_t_m1_global, + j_z_m0_global, 
j_z_m1_global, + nthreads, tx_chunks): + """ + Deposition of the current density J using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each threads are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of J that is deposited to the + 16 cells surounding the particle based on its shape (cubic). + + The rest of the execution is similar to the CUDA equivalent function. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local current component + in each direction (r, t, z) on the interpolation grid for mode 0 and 1. + (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + tx_chunks : list of int + Exact particle batch size per thread. + Last batch size can be greater if Ntot is not a multiple of nthreads. 
+ """ + # Deposit the field per cell in parallel (for threads < number of cells) + for tx in prange( nthreads ): + # Loop over all particles in thread chunk + for idx in range( tx_chunks[tx] ): + # Calculate thread local particle index + ptcl_idx = idx + tx*tx_chunks[0] + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Velocity + uxj = ux[ptcl_idx] + uyj = uy[ptcl_idx] + uzj = uz[ptcl_idx] + # Inverse gamma + inv_gammaj = inv_gamma[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Original index of the uppper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Compute values in local copies and consider boundaries + ir_flip = 
int64(math.floor(r_cell)) - 1 + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode for r,t and z. + J_r_m0_00 = 0. + J_t_m0_00 = 0. + J_z_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m1_00 = 0. + 0.j + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_t_m0_01 = 0. + J_z_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m1_01 = 0. + 0.j + J_z_m1_01 = 0. + 0.j + + J_r_m0_02 = 0. + J_t_m0_02 = 0. + J_z_m0_02 = 0. + J_r_m1_02 = 0. + 0.j + J_t_m1_02 = 0. + 0.j + J_z_m1_02 = 0. + 0.j + + J_r_m0_03 = 0. + J_t_m0_03 = 0. + J_z_m0_03 = 0. + J_r_m1_03 = 0. + 0.j + J_t_m1_03 = 0. + 0.j + J_z_m1_03 = 0. + 0.j + + J_r_m0_10 = 0. + J_t_m0_10 = 0. + J_z_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m1_10 = 0. + 0.j + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_t_m0_11 = 0. + J_z_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m1_11 = 0. + 0.j + J_z_m1_11 = 0. + 0.j + + J_r_m0_12 = 0. + J_t_m0_12 = 0. + J_z_m0_12 = 0. + J_r_m1_12 = 0. + 0.j + J_t_m1_12 = 0. + 0.j + J_z_m1_12 = 0. + 0.j + + J_r_m0_13 = 0. + J_t_m0_13 = 0. + J_z_m0_13 = 0. + J_r_m1_13 = 0. + 0.j + J_t_m1_13 = 0. + 0.j + J_z_m1_13 = 0. + 0.j + + J_r_m0_20 = 0. + J_t_m0_20 = 0. + J_z_m0_20 = 0. + J_r_m1_20 = 0. + 0.j + J_t_m1_20 = 0. + 0.j + J_z_m1_20 = 0. + 0.j + + J_r_m0_21 = 0. + J_t_m0_21 = 0. + J_z_m0_21 = 0. + J_r_m1_21 = 0. + 0.j + J_t_m1_21 = 0. + 0.j + J_z_m1_21 = 0. + 0.j + + J_r_m0_22 = 0. + J_t_m0_22 = 0. + J_z_m0_22 = 0. + J_r_m1_22 = 0. + 0.j + J_t_m1_22 = 0. + 0.j + J_z_m1_22 = 0. + 0.j + + J_r_m0_23 = 0. + J_t_m0_23 = 0. + J_z_m0_23 = 0. + J_r_m1_23 = 0. + 0.j + J_t_m1_23 = 0. + 0.j + J_z_m1_23 = 0. + 0.j + + J_r_m0_30 = 0. + J_t_m0_30 = 0. + J_z_m0_30 = 0. + J_r_m1_30 = 0. + 0.j + J_t_m1_30 = 0. + 0.j + J_z_m1_30 = 0. + 0.j + + J_r_m0_31 = 0. + J_t_m0_31 = 0. + J_z_m0_31 = 0. + J_r_m1_31 = 0. + 0.j + J_t_m1_31 = 0. + 0.j + J_z_m1_31 = 0. + 0.j + + J_r_m0_32 = 0. + J_t_m0_32 = 0. + J_z_m0_32 = 0. + J_r_m1_32 = 0. + 0.j + J_t_m1_32 = 0. + 0.j + J_z_m1_32 = 0. 
+ 0.j + + J_r_m0_33 = 0. + J_t_m0_33 = 0. + J_z_m0_33 = 0. + J_r_m1_33 = 0. + 0.j + J_t_m1_33 = 0. + 0.j + J_z_m1_33 = 0. + 0.j + + if (ir_flip == -2): + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + + J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 
2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += 
r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir_flip == -1): + J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 
2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += 
r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + 
J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir_flip >= 0): + J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 
1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_03 += 
r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + 
J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 
0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + ] += srl = 1 + + j_r_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 + j_r_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 + j_r_m0_global[iz_cell, ir_cell - 1 + srl] += J_r_m0_01 + j_r_m1_global[iz_cell, ir_cell - 1 + srl] += J_r_m1_01 + j_r_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 + j_r_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 + j_r_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 + j_r_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03 + j_r_m0_global[iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 + j_r_m1_global[iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 + j_r_m0_global[iz_cell, ir_cell] += J_r_m0_11 + j_r_m1_global[iz_cell, ir_cell] += J_r_m1_11 + j_r_m0_global[iz_cell + 1 + szu, ir_cell] += J_r_m0_12 + j_r_m1_global[iz_cell + 1 + szu, ir_cell] += J_r_m1_12 + j_r_m0_global[iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 + j_r_m1_global[iz_cell + 2 + szu2, ir_cell] += J_r_m1_13 + j_r_m0_global[iz_cell - 1 + szl, 
ir_cell + 1 + sru] += J_r_m0_20 + j_r_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 + j_r_m0_global[iz_cell, ir_cell + 1 + sru] += J_r_m0_21 + j_r_m1_global[iz_cell, ir_cell + 1 + sru] += J_r_m1_21 + j_r_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 + j_r_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 + j_r_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 + j_r_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 + j_r_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 + j_r_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 + j_r_m0_global[iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 + j_r_m1_global[iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 + j_r_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 + j_r_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 + j_r_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 + j_r_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 + + j_t_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 + j_t_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 + j_t_m0_global[iz_cell, ir_cell - 1 + srl] += J_t_m0_01 + j_t_m1_global[iz_cell, ir_cell - 1 + srl] += J_t_m1_01 + j_t_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 + j_t_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02 + j_t_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 + j_t_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 + j_t_m0_global[iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 + j_t_m1_global[iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 + j_t_m0_global[iz_cell, ir_cell] += J_t_m0_11 + j_t_m1_global[iz_cell, ir_cell] += J_t_m1_11 + j_t_m0_global[iz_cell + 1 + szu, ir_cell] += J_t_m0_12 + j_t_m1_global[iz_cell + 1 + szu, ir_cell] += J_t_m1_12 + j_t_m0_global[iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 + j_t_m1_global[iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 + j_t_m0_global[iz_cell - 1 + 
szl, ir_cell + 1 + sru] += J_t_m0_20 + j_t_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 + j_t_m0_global[iz_cell, ir_cell + 1 + sru] += J_t_m0_21 + j_t_m1_global[iz_cell, ir_cell + 1 + sru] += J_t_m1_21 + j_t_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 + j_t_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 + j_t_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 + j_t_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 + j_t_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 + j_t_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30 + j_t_m0_global[iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 + j_t_m1_global[iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 + j_t_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 + j_t_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32 + j_t_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 + j_t_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 + + j_z_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 + j_z_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 + j_z_m0_global[iz_cell, ir_cell - 1 + srl] += J_z_m0_01 + j_z_m1_global[iz_cell, ir_cell - 1 + srl] += J_z_m1_01 + j_z_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 + j_z_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 + j_z_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 + j_z_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 + j_z_m0_global[iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 + j_z_m1_global[iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 + j_z_m0_global[iz_cell, ir_cell] += J_z_m0_11 + j_z_m1_global[iz_cell, ir_cell] += J_z_m1_11 + j_z_m0_global[iz_cell + 1 + szu, ir_cell] += J_z_m0_12 + j_z_m1_global[iz_cell + 1 + szu, ir_cell] += J_z_m1_12 + j_z_m0_global[iz_cell + 2 + szu2, ir_cell] += J_z_m0_13 + j_z_m1_global[iz_cell + 2 + szu2, ir_cell] += J_z_m1_13 + j_z_m0_global[iz_cell - 
1 + szl, ir_cell + 1 + sru] += J_z_m0_20 + j_z_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20 + j_z_m0_global[iz_cell, ir_cell + 1 + sru] += J_z_m0_21 + j_z_m1_global[iz_cell, ir_cell + 1 + sru] += J_z_m1_21 + j_z_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22 + j_z_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22 + j_z_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23 + j_z_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23 + j_z_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30 + j_z_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30 + j_z_m0_global[iz_cell, ir_cell + 2 + sru2] += J_z_m0_31 + j_z_m1_global[iz_cell, ir_cell + 2 + sru2] += J_z_m1_31 + j_z_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32 + j_z_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32 + j_z_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33 + j_z_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33 + + return + # ----------------------------------------------------------------------- # Parallel reduction of the global arrays for threads into a single array # ----------------------------------------------------------------------- From 2053e1f674a40b5f8c97aa49b188c124976721e0 Mon Sep 17 00:00:00 2001 From: Manuel Kirchen Date: Mon, 17 Jul 2017 17:14:47 +0200 Subject: [PATCH 13/36] Adapted particles.py for cubic prange deposition --- fbpic/particles/particles.py | 43 ++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index d1893d38f..a64434753 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -21,7 +21,8 @@ from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \ push_x_prange from .deposition.threading_methods import deposit_rho_prange_linear, \ - deposit_J_prange_linear, sum_reduce_2d_array #CUBIC tbd + 
deposit_J_prange_linear, deposit_rho_prange_cubic, + deposit_J_prange_cubic, sum_reduce_2d_array from .gathering.threading_methods import gather_field_prange_linear, \ gather_field_prange_cubic @@ -782,16 +783,14 @@ def deposit( self, fld, fieldtype ) : grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, rho_m0_global, rho_m1_global, - self.nthreads, tx_chunks, tx_N ) + self.nthreads, tx_chunks ) elif self.particle_shape == 'cubic': - print('Not yet implemented') - # deposit_rho_prange_cubic( - # self.x, self.y, self.z, self.w, - # grid[0].invdz, grid[0].zmin, grid[0].Nz, - # grid[0].invdr, grid[0].rmin, grid[0].Nr, - # rho_m0_global, rho_m1_global, - # grid[0].rho, grid[1].rho, - # self.nthreads, tx_chunks, tx_N ) + deposit_rho_prange_cubic( + self.x, self.y, self.z, self.w, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + rho_m0_global, rho_m1_global, + self.nthreads, tx_chunks ) else: raise ValueError("`particle_shape` should be either \ 'linear' or 'cubic' \ @@ -830,21 +829,17 @@ def deposit( self, fld, fieldtype ) : Jr_m0_global, Jr_m1_global, Jt_m0_global, Jt_m1_global, Jz_m0_global, Jz_m1_global, - self.nthreads, tx_chunks, tx_N ) + self.nthreads, tx_chunks ) elif self.particle_shape == 'cubic': - print('Not yet implemented') - # deposit_J_prange_cubic( - # self.x, self.y, self.z, self.w, - # self.ux, self.uy, self.uz, self.inv_gamma, - # grid[0].invdz, grid[0].zmin, grid[0].Nz, - # grid[0].invdr, grid[0].rmin, grid[0].Nr, - # Jr_m0_global, Jr_m1_global, - # Jt_m0_global, Jt_m1_global, - # Jz_m0_global, Jz_m1_global, - # grid[0].Jr, grid[1].Jr, - # grid[0].Jt, grid[1].Jt, - # grid[0].Jz, grid[1].Jz, - # self.nthreads, tx_chunks, tx_N ) + deposit_J_prange_cubic( + self.x, self.y, self.z, self.w, + self.ux, self.uy, self.uz, self.inv_gamma, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + Jr_m0_global, Jr_m1_global, + Jt_m0_global, Jt_m1_global, + 
Jz_m0_global, Jz_m1_global, + self.nthreads, tx_chunks ) else: raise ValueError("`particle_shape` should be either \ 'linear' or 'cubic' \ From 392354dff6009ebed9bfc5dee3cff2c5b89111d3 Mon Sep 17 00:00:00 2001 From: Manuel Kirchen Date: Mon, 17 Jul 2017 17:18:10 +0200 Subject: [PATCH 14/36] Removed linear_non_atomic shape from uniform_rho test --- tests/test_uniform_rho_deposition.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_uniform_rho_deposition.py b/tests/test_uniform_rho_deposition.py index e0a7978e6..8978f9625 100644 --- a/tests/test_uniform_rho_deposition.py +++ b/tests/test_uniform_rho_deposition.py @@ -10,7 +10,7 @@ (i.e. this confirms that no Verboncoeur-type correction is needed) - Shifting this plasma by a small amount in r, and still verifying that the deposited density is uniform - The tests are performed with different particle shapes: linear + The tests are performed with different particle shapes: linear, cubic Usage : from the top-level directory of FBPIC run @@ -47,7 +47,7 @@ def test_uniform_electron_plasma(show=False): "Function that is run by py.test, when doing `python setup.py test`" - for shape in ['cubic', 'linear', 'linear_non_atomic']: + for shape in ['linear', 'cubic']: uniform_electron_plasma( shape, show ) def uniform_electron_plasma(shape, show=False): @@ -87,7 +87,7 @@ def uniform_electron_plasma(shape, show=False): def test_neutral_plasma_shifted(show=False): "Function that is run by py.test, when doing `python setup.py test`" - for shape in ['cubic', 'linear', 'linear_non_atomic']: + for shape in ['linear', 'cubic']: neutral_plasma_shifted( shape, show ) def neutral_plasma_shifted(shape, show=False): From 705584511199a4061d1963509e5a27da3ce846cf Mon Sep 17 00:00:00 2001 From: Manuel Kirchen Date: Mon, 17 Jul 2017 17:23:46 +0200 Subject: [PATCH 15/36] Corrected some bugs introduced in last commits --- fbpic/particles/deposition/threading_methods.py | 2 +- fbpic/particles/particles.py | 2 +- 2 
files changed, 2 insertions(+), 2 deletions(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index 3cab552be..55d53bbf0 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -1423,7 +1423,7 @@ def deposit_J_prange_cubic(x, y, z, w, if (ir_cell+1) == (Nr - 1): sru2 = -1 if (ir_cell-1) < 0: - ] += srl = 1 + srl = 1 j_r_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 j_r_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index a64434753..cf8401d25 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -21,7 +21,7 @@ from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \ push_x_prange from .deposition.threading_methods import deposit_rho_prange_linear, \ - deposit_J_prange_linear, deposit_rho_prange_cubic, + deposit_J_prange_linear, deposit_rho_prange_cubic, \ deposit_J_prange_cubic, sum_reduce_2d_array from .gathering.threading_methods import gather_field_prange_linear, \ gather_field_prange_cubic From 0a22108a1517fd8dc6fa46d67f9c1423a63116df Mon Sep 17 00:00:00 2001 From: Manuel Kirchen Date: Mon, 17 Jul 2017 19:00:21 +0200 Subject: [PATCH 16/36] Fix cubic deposition and cubic gathering --- .../particles/deposition/threading_methods.py | 275 +++++++++--------- fbpic/particles/gathering/cuda_methods.py | 2 +- .../particles/gathering/threading_methods.py | 49 +++- 3 files changed, 166 insertions(+), 160 deletions(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index 55d53bbf0..11ecb54c8 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -780,53 +780,38 @@ def deposit_rho_prange_cubic(x, y, z, w, srl = 1 # Write ptcl fields to thread-local part of global deposition array - 
rho_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00 - rho_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00 - - rho_m0_global[iz_cell, ir_cell - 1 + srl] += R_m0_01 - rho_m1_global[iz_cell, ir_cell - 1 + srl] += R_m1_01 - - rho_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02 - rho_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02 - - rho_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03 - rho_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03 - - rho_m0_global[iz_cell - 1 + szl, ir_cell] += R_m0_10 - rho_m1_global[iz_cell - 1 + szl, ir_cell] += R_m1_10 - - rho_m0_global[iz_cell, ir_cell] += R_m0_11 - rho_m1_global[iz_cell, ir_cell] += R_m1_11 - - rho_m0_global[iz_cell + 1 + szu, ir_cell] += R_m0_12 - rho_m1_global[iz_cell + 1 + szu, ir_cell] += R_m1_12 - - rho_m0_global[iz_cell + 2 + szu2, ir_cell] += R_m0_13 - rho_m1_global[iz_cell + 2 + szu2, ir_cell] += R_m1_13 - - rho_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20 - rho_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20 - - rho_m0_global[iz_cell, ir_cell + 1 + sru] += R_m0_21 - rho_m1_global[iz_cell, ir_cell + 1 + sru] += R_m1_21 - - rho_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22 - rho_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22 - - rho_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23 - rho_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23 - - rho_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30 - rho_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30 - - rho_m0_global[iz_cell, ir_cell + 2 + sru2] += R_m0_31 - rho_m1_global[iz_cell, ir_cell + 2 + sru2] += R_m1_31 - - rho_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32 - rho_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32 - - rho_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33 - rho_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33 + rho_m0_global[tx, iz_cell - 1 + 
szl, ir_cell - 1 + srl] += R_m0_00 + rho_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00 + rho_m0_global[tx, iz_cell, ir_cell - 1 + srl] += R_m0_01 + rho_m1_global[tx, iz_cell, ir_cell - 1 + srl] += R_m1_01 + rho_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02 + rho_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02 + rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03 + rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03 + rho_m0_global[tx, iz_cell - 1 + szl, ir_cell] += R_m0_10 + rho_m1_global[tx, iz_cell - 1 + szl, ir_cell] += R_m1_10 + rho_m0_global[tx, iz_cell, ir_cell] += R_m0_11 + rho_m1_global[tx, iz_cell, ir_cell] += R_m1_11 + rho_m0_global[tx, iz_cell + 1 + szu, ir_cell] += R_m0_12 + rho_m1_global[tx, iz_cell + 1 + szu, ir_cell] += R_m1_12 + rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m0_13 + rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m1_13 + rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20 + rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20 + rho_m0_global[tx, iz_cell, ir_cell + 1 + sru] += R_m0_21 + rho_m1_global[tx, iz_cell, ir_cell + 1 + sru] += R_m1_21 + rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22 + rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22 + rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23 + rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23 + rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30 + rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30 + rho_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m0_31 + rho_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m1_31 + rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32 + rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32 + rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33 + rho_m1_global[tx, iz_cell + 2 + szu2, 
ir_cell + 2 + sru2] += R_m1_33 return @@ -1425,104 +1410,104 @@ def deposit_J_prange_cubic(x, y, z, w, if (ir_cell-1) < 0: srl = 1 - j_r_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 - j_r_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 - j_r_m0_global[iz_cell, ir_cell - 1 + srl] += J_r_m0_01 - j_r_m1_global[iz_cell, ir_cell - 1 + srl] += J_r_m1_01 - j_r_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 - j_r_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 - j_r_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 - j_r_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03 - j_r_m0_global[iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 - j_r_m1_global[iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 - j_r_m0_global[iz_cell, ir_cell] += J_r_m0_11 - j_r_m1_global[iz_cell, ir_cell] += J_r_m1_11 - j_r_m0_global[iz_cell + 1 + szu, ir_cell] += J_r_m0_12 - j_r_m1_global[iz_cell + 1 + szu, ir_cell] += J_r_m1_12 - j_r_m0_global[iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 - j_r_m1_global[iz_cell + 2 + szu2, ir_cell] += J_r_m1_13 - j_r_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20 - j_r_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 - j_r_m0_global[iz_cell, ir_cell + 1 + sru] += J_r_m0_21 - j_r_m1_global[iz_cell, ir_cell + 1 + sru] += J_r_m1_21 - j_r_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 - j_r_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 - j_r_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 - j_r_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 - j_r_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 - j_r_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 - j_r_m0_global[iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 - j_r_m1_global[iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 - j_r_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 - j_r_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 - 
j_r_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 - j_r_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 - - j_t_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 - j_t_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 - j_t_m0_global[iz_cell, ir_cell - 1 + srl] += J_t_m0_01 - j_t_m1_global[iz_cell, ir_cell - 1 + srl] += J_t_m1_01 - j_t_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 - j_t_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02 - j_t_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 - j_t_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 - j_t_m0_global[iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 - j_t_m1_global[iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 - j_t_m0_global[iz_cell, ir_cell] += J_t_m0_11 - j_t_m1_global[iz_cell, ir_cell] += J_t_m1_11 - j_t_m0_global[iz_cell + 1 + szu, ir_cell] += J_t_m0_12 - j_t_m1_global[iz_cell + 1 + szu, ir_cell] += J_t_m1_12 - j_t_m0_global[iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 - j_t_m1_global[iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 - j_t_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20 - j_t_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 - j_t_m0_global[iz_cell, ir_cell + 1 + sru] += J_t_m0_21 - j_t_m1_global[iz_cell, ir_cell + 1 + sru] += J_t_m1_21 - j_t_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 - j_t_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 - j_t_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 - j_t_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 - j_t_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 - j_t_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30 - j_t_m0_global[iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 - j_t_m1_global[iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 - j_t_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 - j_t_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += 
J_t_m1_32 - j_t_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 - j_t_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 - - j_z_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 - j_z_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 - j_z_m0_global[iz_cell, ir_cell - 1 + srl] += J_z_m0_01 - j_z_m1_global[iz_cell, ir_cell - 1 + srl] += J_z_m1_01 - j_z_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 - j_z_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 - j_z_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 - j_z_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 - j_z_m0_global[iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 - j_z_m1_global[iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 - j_z_m0_global[iz_cell, ir_cell] += J_z_m0_11 - j_z_m1_global[iz_cell, ir_cell] += J_z_m1_11 - j_z_m0_global[iz_cell + 1 + szu, ir_cell] += J_z_m0_12 - j_z_m1_global[iz_cell + 1 + szu, ir_cell] += J_z_m1_12 - j_z_m0_global[iz_cell + 2 + szu2, ir_cell] += J_z_m0_13 - j_z_m1_global[iz_cell + 2 + szu2, ir_cell] += J_z_m1_13 - j_z_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20 - j_z_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20 - j_z_m0_global[iz_cell, ir_cell + 1 + sru] += J_z_m0_21 - j_z_m1_global[iz_cell, ir_cell + 1 + sru] += J_z_m1_21 - j_z_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22 - j_z_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22 - j_z_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23 - j_z_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23 - j_z_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30 - j_z_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30 - j_z_m0_global[iz_cell, ir_cell + 2 + sru2] += J_z_m0_31 - j_z_m1_global[iz_cell, ir_cell + 2 + sru2] += J_z_m1_31 - j_z_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32 - j_z_m1_global[iz_cell + 1 + szu, ir_cell + 2 + 
sru2] += J_z_m1_32 - j_z_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33 - j_z_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33 + j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 + j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 + j_r_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m0_01 + j_r_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m1_01 + j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 + j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 + j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 + j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03 + j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 + j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 + j_r_m0_global[tx, iz_cell, ir_cell] += J_r_m0_11 + j_r_m1_global[tx, iz_cell, ir_cell] += J_r_m1_11 + j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m0_12 + j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m1_12 + j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 + j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13 + j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20 + j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 + j_r_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m0_21 + j_r_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m1_21 + j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 + j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 + j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 + j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 + j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 + j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 + j_r_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 + j_r_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 + 
j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 + j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 + j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 + j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 + + j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 + j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 + j_t_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m0_01 + j_t_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m1_01 + j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 + j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02 + j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 + j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 + j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 + j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 + j_t_m0_global[tx, iz_cell, ir_cell] += J_t_m0_11 + j_t_m1_global[tx, iz_cell, ir_cell] += J_t_m1_11 + j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m0_12 + j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m1_12 + j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 + j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 + j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20 + j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 + j_t_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m0_21 + j_t_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m1_21 + j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 + j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 + j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 + j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 + j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 + j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += 
J_t_m1_30 + j_t_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 + j_t_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 + j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 + j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32 + j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 + j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 + + j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 + j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 + j_z_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m0_01 + j_z_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m1_01 + j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 + j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 + j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 + j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 + j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 + j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 + j_z_m0_global[tx, iz_cell, ir_cell] += J_z_m0_11 + j_z_m1_global[tx, iz_cell, ir_cell] += J_z_m1_11 + j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m0_12 + j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m1_12 + j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13 + j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13 + j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20 + j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20 + j_z_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m0_21 + j_z_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m1_21 + j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22 + j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22 + j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23 + j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23 + 
j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30 + j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30 + j_z_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31 + j_z_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31 + j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32 + j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32 + j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33 + j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33 return diff --git a/fbpic/particles/gathering/cuda_methods.py b/fbpic/particles/gathering/cuda_methods.py index 392d3ef52..12c763860 100644 --- a/fbpic/particles/gathering/cuda_methods.py +++ b/fbpic/particles/gathering/cuda_methods.py @@ -430,8 +430,8 @@ def gather_field_gpu_cubic(x, y, z, z_cell = invdz*(zj - zmin) - 0.5 # Calculate the shape factors - Sr = cuda.local.array((4,), dtype=float64) ir = cuda.local.array((4,), dtype=int64) + Sr = cuda.local.array((4,), dtype=float64) ir[0] = int64(math.floor(r_cell)) - 1 ir[1] = ir[0] + 1 ir[2] = ir[1] + 1 diff --git a/fbpic/particles/gathering/threading_methods.py b/fbpic/particles/gathering/threading_methods.py index 73da99cfe..2c996a5ff 100644 --- a/fbpic/particles/gathering/threading_methods.py +++ b/fbpic/particles/gathering/threading_methods.py @@ -9,6 +9,7 @@ from numba import prange, int64 import numba import math +import numpy as np # ----------------------- # Field gathering linear @@ -411,8 +412,8 @@ def gather_field_prange_cubic(x, y, z, z_cell = invdz*(zj - zmin) - 0.5 # Calculate the shape factors - Sr = [0.,0.,0.,0.] - ir = [0,0,0,0] + ir = np.zeros(4, dtype=int64) #[0,0,0,0] + Sr = np.zeros(4) #[0.,0.,0.,0.] ir[0] = int64(math.floor(r_cell)) - 1 ir[1] = ir[0] + 1 ir[2] = ir[1] + 1 @@ -421,8 +422,8 @@ def gather_field_prange_cubic(x, y, z, Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4) Sr[2] = 1./6. 
* (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4) Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3 - iz = [0.,0.,0.,0.] - Sz = [0,0,0,0] + iz = np.zeros(4, dtype=int64) #[0,0,0,0] + Sz = np.zeros(4) #[0.,0.,0.,0.] iz[0] = int64(math.floor(z_cell)) - 1 iz[1] = iz[0] + 1 iz[2] = iz[1] + 1 @@ -432,18 +433,22 @@ def gather_field_prange_cubic(x, y, z, Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4) Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3 # Lower and upper periodic boundary for z - for index_z in range(4): + index_z = 0 + while index_z < 4: if iz[index_z] < 0: iz[index_z] += Nz if iz[index_z] > Nz - 1: iz[index_z] -= Nz + index_z += 1 # Lower and upper boundary for r - for index_r in range(4): + index_r = 0 + while index_r < 4: if ir[index_r] < 0: ir[index_r] = abs(ir[index_r])-1 Sr[index_r] = (-1.)*Sr[index_r] if ir[index_r] > Nr - 1: ir[index_r] = Nr - 1 + index_r += 1 # E-Field # ---------------------------- @@ -461,8 +466,10 @@ def gather_field_prange_cubic(x, y, z, Ft_m = 0.j Fz_m = 0.j # Add the fields for mode 0 - for index_r in range(4): - for index_z in range(4): + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]] Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]] if Sz[index_z]*Sr[index_r] < 0: @@ -471,6 +478,8 @@ def gather_field_prange_cubic(x, y, z, else: Fz_m += Sz[index_z]*Sr[index_r]* \ Ez_m0[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 Fr += (Fr_m*exptheta_m0).real Ft += (Ft_m*exptheta_m0).real @@ -484,8 +493,10 @@ def gather_field_prange_cubic(x, y, z, Ft_m = 0.j Fz_m = 0.j # Add the fields for mode 1 - for index_r in range(4): - for index_z in range(4): + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: if Sz[index_z]*Sr[index_r] < 0: Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ Er_m1[iz[index_z], ir[index_r]] @@ -497,6 +508,8 @@ def gather_field_prange_cubic(x, y, z, Ft_m += Sz[index_z]*Sr[index_r]* \ Et_m1[iz[index_z], 
ir[index_r]] Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 # Add the fields from the mode 1 Fr += 2*(Fr_m*exptheta_m1).real @@ -525,8 +538,10 @@ def gather_field_prange_cubic(x, y, z, Ft_m = 0.j Fz_m = 0.j # Add the fields for mode 0 - for index_r in range(4): - for index_z in range(4): + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: Fr_m += Sz[index_z]*Sr[index_r]* \ Br_m0[iz[index_z], ir[index_r]] Ft_m += Sz[index_z]*Sr[index_r]* \ @@ -537,6 +552,8 @@ def gather_field_prange_cubic(x, y, z, else: Fz_m += Sz[index_z]*Sr[index_r]* \ Bz_m0[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 # Add the fields from the mode 0 Fr += (Fr_m*exptheta_m0).real @@ -552,8 +569,10 @@ def gather_field_prange_cubic(x, y, z, Fz_m = 0.j # Add the fields for mode 1 - for index_r in range(4): - for index_z in range(4): + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: if Sz[index_z]*Sr[index_r] < 0: Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ Br_m1[iz[index_z], ir[index_r]] @@ -565,6 +584,8 @@ def gather_field_prange_cubic(x, y, z, Ft_m += Sz[index_z]*Sr[index_r]* \ Bt_m1[iz[index_z], ir[index_r]] Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 # Add the fields from the mode 1 Fr += 2*(Fr_m*exptheta_m1).real From c2f75b3889fc7a130e35feb7aea50dc8cc2ac124 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 18 Jul 2017 21:55:46 -0700 Subject: [PATCH 17/36] Remove function signature in field methods --- fbpic/fields/numba_methods.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py index 830fec37a..b6c8d98a9 100644 --- a/fbpic/fields/numba_methods.py +++ b/fbpic/fields/numba_methods.py @@ -9,10 +9,7 @@ from scipy.constants import c, epsilon_0, mu_0 c2 = c**2 -@numba.jit('void(complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], 
complex128[:,:], \ - float64[:,:], float64[:,:], float64[:,:], \ - float64, int32, int32)') +@numba.njit def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, inv_dt, Nz, Nr ): """ @@ -33,13 +30,7 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, Jm[iz, ir] += -0.5 * kr[iz, ir] * F Jz[iz, ir] += -1.j * kz[iz, ir] * F -@numba.jit('void(complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - float64[:,:], float64[:,:], float64[:,:], \ - float64[:,:], float64[:,:], float64[:,:], float64[:,:], float64, \ - int8, int32, int32)') +@numba.njit def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, @@ -106,7 +97,7 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \ + 1.j*kr[iz, ir]*Jm[iz, ir] ) -@numba.jit +@numba.njit def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, j_corr_coef, T_eb, T_cc, @@ -130,7 +121,7 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, Jm[iz, ir] += -0.5 * kr[iz, ir] * F Jz[iz, ir] += -1.j * kz[iz, ir] * F -@numba.jit +@numba.njit def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, From 18486c13d694301841a7214a3cfb6fb86919a6fc Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 18 Jul 2017 22:14:56 -0700 Subject: [PATCH 18/36] Create threading_utils.py --- fbpic/threading_utils.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 fbpic/threading_utils.py diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py new file mode 100644 index 000000000..1fe2d4057 --- /dev/null +++ b/fbpic/threading_utils.py @@ -0,0 +1,15 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi 
Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines a set of generic functions for multithreaded CPU execution. +""" +try: + # Try to import the threading function prange + from numba import prange + threading_installed = True +except ImportError: + # If not replace threading functions by single-thread functions + prange = range + threading_installed = False From fce66929e2e1cf9259dd52229f0c85d77d3e64b0 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 18 Jul 2017 22:18:28 -0700 Subject: [PATCH 19/36] Check if threading is installed in main.py --- fbpic/main.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fbpic/main.py b/fbpic/main.py index e3ce1e975..a87604771 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -11,6 +11,8 @@ # as it sets the cuda context) from mpi4py import MPI import numba +# Check if threading is available +from .threading_utils import threading_installed # Check if CUDA is available, then import CUDA functions from .cuda_utils import cuda_installed if cuda_installed: @@ -44,8 +46,8 @@ def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax, n_order=-1, dens_func=None, filter_currents=True, v_comoving=None, use_galilean=False, initialize_ions=False, use_cuda=False, use_threading=True, nthreads=None, - n_guard=None, n_damp=30, exchange_period=None, - boundaries='periodic', gamma_boost=None, + n_guard=None, n_damp=30, exchange_period=None, + boundaries='periodic', gamma_boost=None, use_all_mpi_ranks=True, particle_shape='linear' ): """ Initializes a simulation, by creating the following structures: @@ -200,6 +202,10 @@ def dens_func( z, r ) ... 
self.use_cuda = False # CPU multi-threading self.use_threading = use_threading + if (use_threading) and (threading_installed=False): + print('*** Threading not available for the simulation.') + print('*** (Please make sure that numba>0.34 is installed)') + self.use_threading = False if self.use_threading: # Define number of threads used if nthreads is not None: @@ -620,7 +626,7 @@ def print_simulation_setup( comm, use_cuda, use_threading ): if use_threading and not use_cuda: message += " (%d threads per proc)" %numba.config.NUMBA_NUM_THREADS message += ".\n" - + print( message ) def adapt_to_grid( x, p_xmin, p_xmax, p_nx, ncells_empty=0 ): From b6c35844ea3b3fd0b3aabf0846a45aff33925144 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 18 Jul 2017 22:22:59 -0700 Subject: [PATCH 20/36] Added threaded methods for the fields --- fbpic/fields/numba_methods.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py index b6c8d98a9..0428ae216 100644 --- a/fbpic/fields/numba_methods.py +++ b/fbpic/fields/numba_methods.py @@ -8,15 +8,16 @@ import numba from scipy.constants import c, epsilon_0, mu_0 c2 = c**2 +from fbpic.threading_utils import threading_installed, prange -@numba.njit +@numba.njit( parallel=threading_installed ) def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, inv_dt, Nz, Nr ): """ Correct the currents in spectral space, using the standard pstad """ # Loop over the 2D grid - for iz in range(Nz): + for iz in prange(Nz): for ir in range(Nr): # Calculate the intermediate variable F @@ -30,7 +31,9 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, Jm[iz, ir] += -0.5 * kr[iz, ir] * F Jz[iz, ir] += -1.j * kz[iz, ir] * F -@numba.njit + return + +@numba.njit( parallel=threading_installed ) def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, @@ 
-42,7 +45,7 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, See the documentation of SpectralGrid.push_eb_with """ # Loop over the 2D grid - for iz in range(Nz): + for iz in prange(Nz): for ir in range(Nr): # Save the electric fields, since it is needed for the B push @@ -97,7 +100,9 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \ + 1.j*kr[iz, ir]*Jm[iz, ir] ) -@numba.njit + return + +@numba.njit( parallel=threading_installed ) def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, j_corr_coef, T_eb, T_cc, @@ -107,7 +112,7 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, of comoving currents """ # Loop over the 2D grid - for iz in range(Nz): + for iz in prange(Nz): for ir in range(Nr): # Calculate the intermediate variable F @@ -121,7 +126,9 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, Jm[iz, ir] += -0.5 * kr[iz, ir] * F Jz[iz, ir] += -1.j * kz[iz, ir] * F -@numba.njit + return + +@numba.njit( parallel=threading_installed ) def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, @@ -198,3 +205,5 @@ def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, + 1.j*kr[iz, ir]*Em_old ) \ + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \ + 1.j*kr[iz, ir]*Jm[iz, ir] ) + + return From 0c52a62a7403412d24c9e583cc46c89bb6e7dee5 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 18 Jul 2017 22:51:52 -0700 Subject: [PATCH 21/36] Added parallel capability for grid methods --- fbpic/fields/numba_methods.py | 16 +++++++------- fbpic/main.py | 2 +- fbpic/particles/particles.py | 39 +++++++++++++++++++---------------- fbpic/threading_utils.py | 4 ++++ 4 files changed, 34 insertions(+), 27 deletions(-) diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py index 0428ae216..f45e10a23 100644 --- a/fbpic/fields/numba_methods.py +++ 
b/fbpic/fields/numba_methods.py @@ -8,15 +8,15 @@ import numba from scipy.constants import c, epsilon_0, mu_0 c2 = c**2 -from fbpic.threading_utils import threading_installed, prange +from fbpic.threading_utils import njit_parallel, prange -@numba.njit( parallel=threading_installed ) +@njit_parallel def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, inv_dt, Nz, Nr ): """ Correct the currents in spectral space, using the standard pstad """ - # Loop over the 2D grid + # Loop over the 2D grid (parallel in z, if threading is installed) for iz in prange(Nz): for ir in range(Nr): @@ -33,7 +33,7 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, return -@numba.njit( parallel=threading_installed ) +@njit_parallel def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, @@ -44,7 +44,7 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, See the documentation of SpectralGrid.push_eb_with """ - # Loop over the 2D grid + # Loop over the 2D grid (parallel in z, if threading is installed) for iz in prange(Nz): for ir in range(Nr): @@ -102,7 +102,7 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, return -@numba.njit( parallel=threading_installed ) +@njit_parallel def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, j_corr_coef, T_eb, T_cc, @@ -111,7 +111,7 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, Correct the currents in spectral space, using the assumption of comoving currents """ - # Loop over the 2D grid + # Loop over the 2D grid (parallel in z, if threading is installed) for iz in prange(Nz): for ir in range(Nr): @@ -128,7 +128,7 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, return -@numba.njit( parallel=threading_installed ) +@njit_parallel def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, 
rho_next_coef, j_coef, diff --git a/fbpic/main.py b/fbpic/main.py index a87604771..068491ffc 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -202,7 +202,7 @@ def dens_func( z, r ) ... self.use_cuda = False # CPU multi-threading self.use_threading = use_threading - if (use_threading) and (threading_installed=False): + if (use_threading) and (threading_installed==False): print('*** Threading not available for the simulation.') print('*** (Please make sure that numba>0.34 is installed)') self.use_threading = False diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index cf8401d25..ac9f2cdfd 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -17,14 +17,17 @@ from .push.numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba from .deposition.numba_methods import deposit_field_numba from .gathering.numba_methods import gather_field_numba -# Load the numba CPU multi-threading methods -from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \ - push_x_prange -from .deposition.threading_methods import deposit_rho_prange_linear, \ - deposit_J_prange_linear, deposit_rho_prange_cubic, \ - deposit_J_prange_cubic, sum_reduce_2d_array -from .gathering.threading_methods import gather_field_prange_linear, \ - gather_field_prange_cubic + +# Check if threading is available, then import threaded functions +from fbpic.threading_utils import threading_installed +if threading_installed: + from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \ + push_x_prange + from .deposition.threading_methods import deposit_rho_prange_linear, \ + deposit_J_prange_linear, deposit_rho_prange_cubic, \ + deposit_J_prange_cubic, sum_reduce_2d_array + from .gathering.threading_methods import gather_field_prange_linear, \ + gather_field_prange_cubic # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed @@ -505,7 +508,7 @@ def halfpush_x( self ) : elif 
self.use_threading: push_x_prange( self.x, self.y, self.z, self.ux, self.uy, self.uz, - self.inv_gamma, self.Ntot, self.dt ) + self.inv_gamma, self.Ntot, self.dt ) # CPU single-core version else: push_x_numba( self.x, self.y, self.z, @@ -763,7 +766,7 @@ def deposit( self, fld, fieldtype ) : # CPU multi-threading version elif self.use_threading: # Register particle chunk size for each thread - tx_N = int(self.Ntot/self.nthreads) + tx_N = int(self.Ntot/self.nthreads) tx_chunks = [ tx_N for k in range(self.nthreads) ] tx_chunks[-1] = tx_chunks[-1] + int(self.Ntot%self.nthreads) # Multithreading functions for the deposition of rho or J @@ -771,10 +774,10 @@ def deposit( self, fld, fieldtype ) : if fieldtype == 'rho': # Generate temporary arrays for rho rho_m0_global = np.zeros( - (self.nthreads, grid[0].rho.shape[0], grid[0].rho.shape[1]), + (self.nthreads, grid[0].rho.shape[0], grid[0].rho.shape[1]), dtype=grid[0].rho.dtype ) rho_m1_global = np.zeros( - (self.nthreads, grid[1].rho.shape[0], grid[1].rho.shape[1]), + (self.nthreads, grid[1].rho.shape[0], grid[1].rho.shape[1]), dtype=grid[1].rho.dtype ) # Deposit rho using CPU threading if self.particle_shape == 'linear': @@ -802,22 +805,22 @@ def deposit( self, fld, fieldtype ) : elif fieldtype == 'J': # Generate temporary arrays for J Jr_m0_global = np.zeros( - (self.nthreads, grid[0].Jr.shape[0], grid[0].Jr.shape[1]), + (self.nthreads, grid[0].Jr.shape[0], grid[0].Jr.shape[1]), dtype=grid[0].Jr.dtype ) Jt_m0_global = np.zeros( - (self.nthreads, grid[0].Jt.shape[0], grid[0].Jt.shape[1]), + (self.nthreads, grid[0].Jt.shape[0], grid[0].Jt.shape[1]), dtype=grid[0].Jt.dtype ) Jz_m0_global = np.zeros( - (self.nthreads, grid[0].Jz.shape[0], grid[0].Jz.shape[1]), + (self.nthreads, grid[0].Jz.shape[0], grid[0].Jz.shape[1]), dtype=grid[0].Jz.dtype ) Jr_m1_global = np.zeros( - (self.nthreads, grid[1].Jr.shape[0], grid[1].Jr.shape[1]), + (self.nthreads, grid[1].Jr.shape[0], grid[1].Jr.shape[1]), dtype=grid[1].Jr.dtype ) 
Jt_m1_global = np.zeros( - (self.nthreads, grid[1].Jt.shape[0], grid[1].Jt.shape[1]), + (self.nthreads, grid[1].Jt.shape[0], grid[1].Jt.shape[1]), dtype=grid[1].Jt.dtype ) Jz_m1_global = np.zeros( - (self.nthreads, grid[1].Jz.shape[0], grid[1].Jz.shape[1]), + (self.nthreads, grid[1].Jz.shape[0], grid[1].Jz.shape[1]), dtype=grid[1].Jz.dtype ) # Deposit J using CPU threading if self.particle_shape == 'linear': diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py index 1fe2d4057..28bf0e28b 100644 --- a/fbpic/threading_utils.py +++ b/fbpic/threading_utils.py @@ -5,11 +5,15 @@ This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) It defines a set of generic functions for multithreaded CPU execution. """ +from numba import njit + try: # Try to import the threading function prange from numba import prange threading_installed = True + njit_parallel = njit( parallel=True ) except ImportError: # If not replace threading functions by single-thread functions prange = range threading_installed = False + njit_parallel = njit From 8458a0d81395165fbab00700291e499c270e4ce4 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 18 Jul 2017 23:02:34 -0700 Subject: [PATCH 22/36] Removed threaded push methods --- fbpic/fields/numba_methods.py | 1 - fbpic/particles/particles.py | 23 +---- fbpic/particles/push/numba_methods.py | 29 +++--- fbpic/particles/push/threading_methods.py | 114 ---------------------- 4 files changed, 20 insertions(+), 147 deletions(-) delete mode 100644 fbpic/particles/push/threading_methods.py diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py index f45e10a23..e80652181 100644 --- a/fbpic/fields/numba_methods.py +++ b/fbpic/fields/numba_methods.py @@ -5,7 +5,6 @@ This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) It defines the optimized fields methods that use numba on a CPU """ -import numba from scipy.constants import c, epsilon_0, mu_0 c2 = c**2 from fbpic.threading_utils import 
njit_parallel, prange diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index ac9f2cdfd..d3075e14f 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -21,8 +21,6 @@ # Check if threading is available, then import threaded functions from fbpic.threading_utils import threading_installed if threading_installed: - from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \ - push_x_prange from .deposition.threading_methods import deposit_rho_prange_linear, \ deposit_J_prange_linear, deposit_rho_prange_cubic, \ deposit_J_prange_cubic, sum_reduce_2d_array @@ -460,19 +458,7 @@ def push_p( self ) : self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz, self.m, self.Ntot, self.dt, self.ionizer.ionization_level ) - # CPU multi-threading version - elif self.use_threading: - if self.ionizer is None: - push_p_prange(self.ux, self.uy, self.uz, self.inv_gamma, - self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz, - self.q, self.m, self.Ntot, self.dt ) - else: - # Ionizable species can have a charge that depends on the - # macroparticle, and hence require a different function - push_p_ioniz_prange(self.ux, self.uy, self.uz, self.inv_gamma, - self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz, - self.m, self.Ntot, self.dt, self.ionizer.ionization_level ) - # CPU single-core version + # CPU version else: if self.ionizer is None: push_p_numba(self.ux, self.uy, self.uz, self.inv_gamma, @@ -504,12 +490,7 @@ def halfpush_x( self ) : self.inv_gamma, self.dt ) # The particle array is unsorted after the push in x self.sorted = False - # CPU multi-threading version - elif self.use_threading: - push_x_prange( self.x, self.y, self.z, - self.ux, self.uy, self.uz, - self.inv_gamma, self.Ntot, self.dt ) - # CPU single-core version + # CPU version else: push_x_numba( self.x, self.y, self.z, self.ux, self.uy, self.uz, diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py index 
6e5807085..d7b0817ba 100644 --- a/fbpic/particles/push/numba_methods.py +++ b/fbpic/particles/push/numba_methods.py @@ -5,11 +5,12 @@ This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) It defines the particle push methods on the CPU with numba. """ -import numba import math +import numba +from fbpic.threading_utils import njit_parallel, prange from scipy.constants import c, e -@numba.njit +@njit_parallel def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): """ Advance the particles' positions over one half-timestep @@ -21,13 +22,15 @@ def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): # Half timestep, multiplied by c chdt = c*0.5*dt - # Particle push - for ip in range(Ntot) : + # Particle push (in parallel if threading is installed) + for ip in prange(Ntot) : x[ip] += chdt * inv_gamma[ip] * ux[ip] y[ip] += chdt * inv_gamma[ip] * uy[ip] z[ip] += chdt * inv_gamma[ip] * uz[ip] -@numba.njit + return + +@njit_parallel def push_p_numba( ux, uy, uz, inv_gamma, Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) : """ @@ -37,13 +40,15 @@ def push_p_numba( ux, uy, uz, inv_gamma, econst = q*dt/(m*c) bconst = 0.5*q*dt/m - # Loop over the particles - for ip in range(Ntot) : + # Loop over the particles (in parallel if threading is installed) + for ip in prange(Ntot) : ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( ux[ip], uy[ip], uz[ip], inv_gamma[ip], Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) -@numba.njit + return + +@njit_parallel def push_p_ioniz_numba( ux, uy, uz, inv_gamma, Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) : """ @@ -53,8 +58,8 @@ def push_p_ioniz_numba( ux, uy, uz, inv_gamma, prefactor_econst = e*dt/(m*c) prefactor_bconst = 0.5*e*dt/m - # Loop over the particles - for ip in range(Ntot) : + # Loop over the particles (in parallel if threading is installed) + for ip in prange(Ntot) : # For neutral macroparticles, skip this step if ionization_level[ip] == 0: @@ -69,6 +74,8 @@ def 
push_p_ioniz_numba( ux, uy, uz, inv_gamma, Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) + return + @numba.njit def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): @@ -107,4 +114,4 @@ def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) - return( ux_f, uy_f, uz_f, inv_gamma_f ) \ No newline at end of file + return( ux_f, uy_f, uz_f, inv_gamma_f ) diff --git a/fbpic/particles/push/threading_methods.py b/fbpic/particles/push/threading_methods.py deleted file mode 100644 index 06205dbfc..000000000 --- a/fbpic/particles/push/threading_methods.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the particle push methods on the CPU with threading. -""" -import numba -from numba import prange -import math -from scipy.constants import c, e - -@numba.njit(parallel=True) -def push_x_prange( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): - """ - Advance the particles' positions over one half-timestep - - This assumes that the positions (x, y, z) are initially either - one half-timestep *behind* the momenta (ux, uy, uz), or at the - same timestep as the momenta. 
- """ - # Half timestep, multiplied by c - chdt = c*0.5*dt - - # Particle push in parallel - for ip in prange(Ntot) : - x[ip] += chdt * inv_gamma[ip] * ux[ip] - y[ip] += chdt * inv_gamma[ip] * uy[ip] - z[ip] += chdt * inv_gamma[ip] * uz[ip] - return x, y, z - -@numba.njit(parallel=True) -def push_p_prange( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) : - """ - Advance the particles' momenta, using numba - """ - # Set a few constants - econst = q*dt/(m*c) - bconst = 0.5*q*dt/m - - # Loop over the particles in parallel - for ip in prange(Ntot) : - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) - return ux, uy, uz, inv_gamma - -@numba.njit(parallel=True) -def push_p_ioniz_prange( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) : - """ - Advance the particles' momenta, using numba - """ - # Set a few constants - prefactor_econst = e*dt/(m*c) - prefactor_bconst = 0.5*e*dt/m - - # Loop over the particles in parallel - for ip in prange(Ntot) : - - # For neutral macroparticles, skip this step - if ionization_level[ip] == 0: - continue - - # Calculate the charge dependent constants - econst = prefactor_econst * ionization_level[ip] - bconst = prefactor_bconst * ionization_level[ip] - # Perform the push - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], - econst, bconst ) - return ux, uy, uz, inv_gamma - -@numba.njit -def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, - Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): - """ - Push at single macroparticle, using the Vay pusher - """ - # Get the magnetic rotation vector - taux = bconst*Bx - tauy = bconst*By - tauz = bconst*Bz - tau2 = taux**2 + tauy**2 + tauz**2 - - # Get the momenta at the half timestep - uxp = ux_i + econst*Ex \ - + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) - uyp = 
uy_i + econst*Ey \ - + inv_gamma_i*( uz_i*taux - ux_i*tauz ) - uzp = uz_i + econst*Ez \ - + inv_gamma_i*( ux_i*tauy - uy_i*taux ) - sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 - utau = uxp*taux + uyp*tauy + uzp*tauz - - # Get the new 1./gamma - inv_gamma_f = math.sqrt( - 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) - - # Reuse the tau and utau variables to save memory - tx = inv_gamma_f*taux - ty = inv_gamma_f*tauy - tz = inv_gamma_f*tauz - ut = inv_gamma_f*utau - s = 1./( 1 + tau2*inv_gamma_f**2 ) - - # Get the new u - ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) - uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) - uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) - - return( ux_f, uy_f, uz_f, inv_gamma_f ) \ No newline at end of file From 6317aa3ad7c51e884ec81a7c326f1838189508e0 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 18 Jul 2017 23:12:12 -0700 Subject: [PATCH 23/36] Corrected push_x's return --- fbpic/particles/push/numba_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py index d7b0817ba..6bc1b1d18 100644 --- a/fbpic/particles/push/numba_methods.py +++ b/fbpic/particles/push/numba_methods.py @@ -28,7 +28,7 @@ def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): y[ip] += chdt * inv_gamma[ip] * uy[ip] z[ip] += chdt * inv_gamma[ip] * uz[ip] - return + return x, y, z @njit_parallel def push_p_numba( ux, uy, uz, inv_gamma, From c817b44141190557d9c9f0ebae3d3411b7dcec50 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 18 Jul 2017 23:28:26 -0700 Subject: [PATCH 24/36] Correct push_p and push_x with return function --- fbpic/particles/particles.py | 8 +++++++- fbpic/particles/push/numba_methods.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index d3075e14f..300d07b12 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -130,12 
+130,18 @@ def dens_func( z, r ) ... # Register the timestep self.dt = dt - # Define wether or not to use the GPU + # Define whether or not to use the GPU self.use_cuda = use_cuda if (self.use_cuda==True) and (cuda_installed==False) : print('*** Cuda not available for the particles.') print('*** Performing the particle operations on the CPU.') self.use_cuda = False + # Define whether or not to use threading + self.use_threading = use_threading + if (self.use_threading==True) and (threading_installed==False) : + print('*** Threading not available for the simulation.') + print('*** (Please make sure that numba>0.34 is installed)') + self.use_threading = False # Register the properties of the particles # (Necessary for the pusher, and when adding more particles later, ) diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py index 6bc1b1d18..6e3843e1b 100644 --- a/fbpic/particles/push/numba_methods.py +++ b/fbpic/particles/push/numba_methods.py @@ -46,7 +46,7 @@ def push_p_numba( ux, uy, uz, inv_gamma, ux[ip], uy[ip], uz[ip], inv_gamma[ip], Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) - return + return ux, uy, uz, inv_gamma @njit_parallel def push_p_ioniz_numba( ux, uy, uz, inv_gamma, @@ -74,7 +74,7 @@ def push_p_ioniz_numba( ux, uy, uz, inv_gamma, Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) - return + return ux, uy, uz, inv_gamma @numba.njit def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, From f1f2ab2f9d18c7a5b1a95506ccdbcf4898d236aa Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Wed, 19 Jul 2017 07:46:14 -0700 Subject: [PATCH 25/36] Give the right threading flag to particles --- fbpic/particles/particles.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index 300d07b12..136a79b33 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -234,7 +234,6 @@ def dens_func( z, r ) ... 
# Register boolean that records if the particles are sorted or not self.sorted = False # Register variables when using multithreading - self.use_threading = use_threading if self.use_threading == True: # Register number of threads self.nthreads = numba.config.NUMBA_NUM_THREADS From 54bd6f0bd46af8bdaba61da21959fbcd52efbec3 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Wed, 19 Jul 2017 12:02:49 -0700 Subject: [PATCH 26/36] Threaded the routines that convert from p/m to r/t components --- .../spectral_transformer.py | 29 +++++----- .../spectral_transform/threading_methods.py | 58 +++++++++++++++++++ 2 files changed, 74 insertions(+), 13 deletions(-) create mode 100644 fbpic/fields/spectral_transform/threading_methods.py diff --git a/fbpic/fields/spectral_transform/spectral_transformer.py b/fbpic/fields/spectral_transform/spectral_transformer.py index a9f99222f..af2d2858b 100644 --- a/fbpic/fields/spectral_transform/spectral_transformer.py +++ b/fbpic/fields/spectral_transform/spectral_transformer.py @@ -9,6 +9,7 @@ from .hankel import DHT from .fourier import FFT +from .threading_methods import numba_rt_to_pm, numba_pm_to_rt # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: @@ -139,12 +140,13 @@ def spect2interp_vect( self, spect_array_p, spect_array_m, self.spect_buffer_r, self.spect_buffer_t ) else : # Combine them on the CPU - # (It is important to write the affectation in the following way, - # since self.spect_buffer_p and self.spect_buffer_r actually point - # to the same object, for memory economy) - self.spect_buffer_r[:,:], self.spect_buffer_t[:,:] = \ - ( self.spect_buffer_p + self.spect_buffer_m), \ - 1.j*( self.spect_buffer_p - self.spect_buffer_m) + # (self.spect_buffer_r and self.spect_buffer_t are + # passed in the following line, in order to make things + # explicit, but they actually point to the same object + # as self.spect_buffer_p, self.spect_buffer_m, + # for economy of 
memory) + numba_pm_to_rt( self.spect_buffer_p, self.spect_buffer_m, + self.spect_buffer_r, self.spect_buffer_t ) # Finally perform the FFT (along axis 0, which corresponds to z) self.fft.inverse_transform( self.spect_buffer_r, interp_array_r ) @@ -205,13 +207,14 @@ def interp2spect_vect( self, interp_array_r, interp_array_t, self.spect_buffer_r, self.spect_buffer_t, self.spect_buffer_p, self.spect_buffer_m ) else : - # Combine them on the CPU - # (It is important to write the affectation in the following way, - # since self.spect_buffer_p and self.spect_buffer_r actually point - # to the same object, for memory economy.) - self.spect_buffer_p[:,:], self.spect_buffer_m[:,:] = \ - 0.5*( self.spect_buffer_r - 1.j*self.spect_buffer_t ), \ - 0.5*( self.spect_buffer_r + 1.j*self.spect_buffer_t ) + # Combine them on the CPU + # (self.spect_buffer_p and self.spect_buffer_m are + # passed in the following line, in order to make things + # explicit, but they actually point to the same object + # as self.spect_buffer_r, self.spect_buffer_t, + # for economy of memory) + numba_rt_to_pm( self.spect_buffer_r, self.spect_buffer_t, + self.spect_buffer_p, self.spect_buffer_m ) # Perform the inverse DHT (along axis -1, which corresponds to r) self.dhtp.transform( self.spect_buffer_p, spect_array_p ) diff --git a/fbpic/fields/spectral_transform/threading_methods.py b/fbpic/fields/spectral_transform/threading_methods.py new file mode 100644 index 000000000..ce16dbeef --- /dev/null +++ b/fbpic/fields/spectral_transform/threading_methods.py @@ -0,0 +1,58 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines a set of functions that are useful when converting the +fields from interpolation grid to the spectral grid and vice-versa +""" +from fbpic.threading_utils import prange, njit_parallel + +# ----------------------------------------------------
+# Functions that combine components in spectral space +# ---------------------------------------------------- + +@njit_parallel +def numba_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) : + """ + Combine the arrays buffer_r and buffer_t to produce the + arrays buffer_p and buffer_m, according to the rules of + the Fourier-Hankel decomposition (see associated paper) + """ + Nz, Nr = buffer_r.shape + + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): + for ir in range(Nr): + + # Use intermediate variables, as the arrays + # buffer_r and buffer_t may actually point to the same + # object as buffer_p and buffer_m, for economy of memory + value_r = buffer_r[iz, ir] + value_t = buffer_t[iz, ir] + # Combine the values + buffer_p[iz, ir] = 0.5*( value_r - 1.j*value_t ) + buffer_m[iz, ir] = 0.5*( value_r + 1.j*value_t ) + + +@njit_parallel +def numba_pm_to_rt( buffer_p, buffer_m, buffer_r, buffer_t ) : + """ + Combine the arrays buffer_p and buffer_m to produce the + arrays buffer_r and buffer_t, according to the rules of + the Fourier-Hankel decomposition (see associated paper) + """ + Nz, Nr = buffer_p.shape + + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): + for ir in range(Nr): + + # Use intermediate variables, as the arrays + # buffer_r and buffer_t may actually point to the same + # object as buffer_p and buffer_m, for economy of memory + value_p = buffer_p[iz, ir] + value_m = buffer_m[iz, ir] + # Combine the values + buffer_r[iz, ir] = ( value_p + value_m ) + buffer_t[iz, ir] = 1.j*( value_p - value_m ) From 5d2c7e5de971be4ffd737b8c7320ec7f63f2e7c4 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 20 Jul 2017 07:54:26 -0700 Subject: [PATCH 27/36] Remove the flag `use_threading` as an input argument --- fbpic/main.py | 24 ++++++++--------------- fbpic/particles/particles.py | 16 ++++----------- fbpic/threading_utils.py | 38 +++++++++++++++++++++++++++--------- 3 files 
changed, 41 insertions(+), 37 deletions(-) diff --git a/fbpic/main.py b/fbpic/main.py index 068491ffc..2266d14ca 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -12,7 +12,7 @@ from mpi4py import MPI import numba # Check if threading is available -from .threading_utils import threading_installed +from .threading_utils import threading_enabled # Check if CUDA is available, then import CUDA functions from .cuda_utils import cuda_installed if cuda_installed: @@ -45,7 +45,7 @@ def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax, p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0., n_order=-1, dens_func=None, filter_currents=True, v_comoving=None, use_galilean=False, initialize_ions=False, - use_cuda=False, use_threading=True, nthreads=None, + use_cuda=False, nthreads=None, n_guard=None, n_damp=30, exchange_period=None, boundaries='periodic', gamma_boost=None, use_all_mpi_ranks=True, particle_shape='linear' ): @@ -134,12 +134,10 @@ def dens_func( z, r ) ... use_cuda: bool, optional Wether to use CUDA (GPU) acceleration - use_threading : bool, optional - Wether to use multi-threading on the CPU. nthreads: int, optional - Number of CPU multi-threading threads used (if use_threading - is set). If nthreads is set to None, the number of threads - are automatically determined. + Number of CPU multi-threading threads used (if threading is + enabled) If nthreads is set to None, the number of threads + is automatically determined. n_guard: int, optional Number of guard cells to use at the left and right of @@ -201,11 +199,7 @@ def dens_func( z, r ) ... 
print('*** Performing the simulation on CPU.') self.use_cuda = False # CPU multi-threading - self.use_threading = use_threading - if (use_threading) and (threading_installed==False): - print('*** Threading not available for the simulation.') - print('*** (Please make sure that numba>0.34 is installed)') - self.use_threading = False + self.use_threading = threading_enabled if self.use_threading: # Define number of threads used if nthreads is not None: @@ -260,16 +254,14 @@ def dens_func( z, r ) ... zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m, grid_shape=grid_shape, particle_shape=particle_shape, - use_cuda=self.use_cuda, - use_threading=self.use_threading) ] + use_cuda=self.use_cuda ) ] if initialize_ions : self.ptcl.append( Particles(q=e, m=m_p, n=n_e, Npz=Npz, zmin=p_zmin, zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m, grid_shape=grid_shape, particle_shape=particle_shape, - use_cuda=self.use_cuda, - use_threading=self.use_threading) ) + use_cuda=self.use_cuda ) ) # Register the number of particles per cell along z, and dt # (Necessary for the moving window) diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index 136a79b33..8474d2dc2 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -19,8 +19,8 @@ from .gathering.numba_methods import gather_field_numba # Check if threading is available, then import threaded functions -from fbpic.threading_utils import threading_installed -if threading_installed: +from fbpic.threading_utils import threading_enabled +if threading_enabled: from .deposition.threading_methods import deposit_rho_prange_linear, \ deposit_J_prange_linear, deposit_rho_prange_cubic, \ deposit_J_prange_cubic, sum_reduce_2d_array @@ -60,7 +60,7 @@ def __init__(self, q, m, n, Npz, zmin, zmax, ux_th=0., uy_th=0., uz_th=0., dens_func=None, continuous_injection=True, grid_shape=None, 
particle_shape='linear', - use_cuda=False, use_threading=True) : + use_cuda=False ) : """ Initialize a uniform set of particles @@ -123,9 +123,6 @@ def dens_func( z, r ) ... use_cuda : bool, optional Wether to use the GPU or not. - - use_threading : bool, optional - Wether to use multi-threading on the CPU. """ # Register the timestep self.dt = dt @@ -136,12 +133,6 @@ def dens_func( z, r ) ... print('*** Cuda not available for the particles.') print('*** Performing the particle operations on the CPU.') self.use_cuda = False - # Define whether or not to use threading - self.use_threading = use_threading - if (self.use_threading==True) and (threading_installed==False) : - print('*** Threading not available for the simulation.') - print('*** (Please make sure that numba>0.34 is installed)') - self.use_threading = False # Register the properties of the particles # (Necessary for the pusher, and when adding more particles later, ) @@ -234,6 +225,7 @@ def dens_func( z, r ) ... # Register boolean that records if the particles are sorted or not self.sorted = False # Register variables when using multithreading + self.use_threading = threading_enabled if self.use_threading == True: # Register number of threads self.nthreads = numba.config.NUMBA_NUM_THREADS diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py index 28bf0e28b..e5bb23b0a 100644 --- a/fbpic/threading_utils.py +++ b/fbpic/threading_utils.py @@ -5,15 +5,35 @@ This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) It defines a set of generic functions for multithreaded CPU execution. 
""" +import os from numba import njit -try: - # Try to import the threading function prange - from numba import prange - threading_installed = True - njit_parallel = njit( parallel=True ) -except ImportError: - # If not replace threading functions by single-thread functions - prange = range - threading_installed = False +# By default threading is enabled +threading_enabled = True + +# Check if the environment variable FBPIC_DISABLE_THREADING is set to 1 +# and in that case, disable threading +if 'FBPIC_DISABLE_THREADING' in os.environ: + if int(os.environ['FBPIC_DISABLE_THREADING']) == 1: + threading_enabled = False + +# If the user request threading (by not setting FBPIC_DISABLE_THREADING) +# check if it is indeed installed +if threading_enabled: + try: + # Try to import the threading function prange + from numba import prange + except ImportError: + threading_enabled = False + print('*** Threading not available for the simulation.') + print('*** (Please make sure that numba>0.34 is installed)') + +# Set the function njit_parallel and prange to the correct object +if not threading_enabled: + # Use regular serial compilation function njit_parallel = njit + prange = range +else: + # Use the parallel compilation function + njit_parallel = njit( parallel=True ) + from numba import prange From 62e37c5162cfef84774ae6a36ba854a0f4c63de8 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 20 Jul 2017 08:24:38 -0700 Subject: [PATCH 28/36] Correct pyflakes errors --- fbpic/threading_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py index e5bb23b0a..4a1200605 100644 --- a/fbpic/threading_utils.py +++ b/fbpic/threading_utils.py @@ -22,7 +22,7 @@ if threading_enabled: try: # Try to import the threading function prange - from numba import prange + import numba.prange except ImportError: threading_enabled = False print('*** Threading not available for the simulation.') @@ -36,4 +36,4 @@ else: # Use 
the parallel compilation function njit_parallel = njit( parallel=True ) - from numba import prange + prange = numba.prange From 076b6686b856cc39ef7837a7dec78814ec4075e7 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 20 Jul 2017 08:19:32 -0700 Subject: [PATCH 29/36] Thread the shifting of the grid in spectral space --- fbpic/boundaries/moving_window.py | 127 +++++++++++++----------------- 1 file changed, 55 insertions(+), 72 deletions(-) diff --git a/fbpic/boundaries/moving_window.py b/fbpic/boundaries/moving_window.py index 33d2d921e..8080ca769 100644 --- a/fbpic/boundaries/moving_window.py +++ b/fbpic/boundaries/moving_window.py @@ -318,86 +318,36 @@ def shift_spect_grid( self, grid, n_move, """ if grid.use_cuda: shift = grid.d_field_shift + # Get a 2D CUDA grid of the size of the grid + tpb, bpg = cuda_tpb_bpg_2d( grid.Ep.shape[0], grid.Ep.shape[1] ) # Shift all the fields on the GPU - self.shift_spect_field_gpu( grid.Ep, shift, n_move ) - self.shift_spect_field_gpu( grid.Em, shift, n_move ) - self.shift_spect_field_gpu( grid.Ez, shift, n_move ) - self.shift_spect_field_gpu( grid.Bp, shift, n_move ) - self.shift_spect_field_gpu( grid.Bm, shift, n_move ) - self.shift_spect_field_gpu( grid.Bz, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Ep, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Em, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Ez, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bp, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bm, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bz, shift, n_move ) if shift_rho: - self.shift_spect_field_gpu( grid.rho_prev, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.rho_prev, shift, n_move ) if shift_currents: - self.shift_spect_field_gpu( grid.Jp, shift, n_move ) - self.shift_spect_field_gpu( grid.Jm, shift, n_move ) - self.shift_spect_field_gpu( grid.Jz, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Jp, shift, n_move ) + 
shift_spect_array_gpu[tpb, bpg]( grid.Jm, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Jz, shift, n_move ) else: shift = grid.field_shift # Shift all the fields on the CPU - self.shift_spect_field( grid.Ep, shift, n_move ) - self.shift_spect_field( grid.Em, shift, n_move ) - self.shift_spect_field( grid.Ez, shift, n_move ) - self.shift_spect_field( grid.Bp, shift, n_move ) - self.shift_spect_field( grid.Bm, shift, n_move ) - self.shift_spect_field( grid.Bz, shift, n_move ) + shift_spect_array_cpu( grid.Ep, shift, n_move ) + shift_spect_array_cpu( grid.Em, shift, n_move ) + shift_spect_array_cpu( grid.Ez, shift, n_move ) + shift_spect_array_cpu( grid.Bp, shift, n_move ) + shift_spect_array_cpu( grid.Bm, shift, n_move ) + shift_spect_array_cpu( grid.Bz, shift, n_move ) if shift_rho: - self.shift_spect_field( grid.rho_prev, shift, n_move ) + shift_spect_array_cpu( grid.rho_prev, shift, n_move ) if shift_currents: - self.shift_spect_field( grid.Jp, shift, n_move ) - self.shift_spect_field( grid.Jm, shift, n_move ) - self.shift_spect_field( grid.Jz, shift, n_move ) - - def shift_spect_field( self, field_array, shift_factor, n_move ): - """ - Shift the field 'field_array' by n_move cells. - This is done in spectral space and corresponds to multiplying the - fields with the factor exp(i*kz_true*dz)**n_move . 
- (Typically n_move is positive, and the fields are shifted backwards) - - Parameters - ---------- - field_array: 2darray of complexs - Contains the value of the fields, and is modified by - this function - - shift_factor: 1darray of complexs - Contains the shift array, that is multiplied to the fields in - spectral space to shift them by one cell in spatial space - ( exp(i*kz_true*dz) ) - - n_move: int - The number of cells by which the grid should be shifted - """ - # Multiply with (shift_factor*sign(n_move))**n_move - field_array *= ( shift_factor[:, np.newaxis] )**n_move - - def shift_spect_field_gpu( self, field_array, shift_factor, n_move): - """ - Shift the field 'field_array' by n_move cells on the GPU. - This is done in spectral space and corresponds to multiplying the - fields with the factor exp(i*kz_true*dz)**n_move . - (Typically n_move is positive, and the fields are shifted backwards) - - Parameters - ---------- - field_array: 2darray of complexs - Contains the value of the fields, and is modified by - this function - - shift_factor: 1darray of complexs - Contains the shift array, that is multiplied to the fields in - spectral space to shift them by one cell in spatial space - ( exp(i*kz_true*dz) ) - - n_move: int - The number of cells by which the grid should be shifted - """ - # Get a 2D CUDA grid of the size of the grid - dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d( - field_array.shape[0], field_array.shape[1] ) - # Shift the field array in place - shift_spect_array_gpu[dim_grid_2d, dim_block_2d]( - field_array, shift_factor, n_move) + shift_spect_array_cpu( grid.Jp, shift, n_move ) + shift_spect_array_cpu( grid.Jm, shift, n_move ) + shift_spect_array_cpu( grid.Jz, shift, n_move ) def shift_interp_grid( self, grid, n_move, shift_rho=True, shift_currents=False ): @@ -513,6 +463,39 @@ def shift_interp_field_gpu( self, field_array, n_move): # Return the new shifted field array return( field_array ) +@njit_parallel +def shift_spect_array_cpu( 
field_array, shift_factor, n_move ): + """ + Shift the field 'field_array' by n_move cells on CPU. + This is done in spectral space and corresponds to multiplying the + fields with the factor exp(i*kz_true*dz)**n_move . + + Parameters + ---------- + field_array: 2darray of complexs + Contains the value of the fields, and is modified by + this function + + shift_factor: 1darray of complexs + Contains the shift array, that is multiplied to the fields in + spectral space to shift them by one cell in spatial space + ( exp(i*kz_true*dz) ) + + n_move: int + The number of cells by which the grid should be shifted + """ + # Get a 2D CUDA grid + iz, ir = cuda.grid(2) + + # Only access values that are actually in the array + if ir < field_array.shape[1] and iz < field_array.shape[0]: + # Calculate the shift factor (raising to the power n_move) + power_shift = shift_factor[iz] + for i in range(1,n_move): + power_shift *= shift_factor[iz] + # Shift fields backwards + field_array[iz, ir] *= power_shift + if cuda_installed: @cuda.jit('void(complex128[:,:], complex128[:,:], int32)') From 0e654a18ba6d9c6a7a1d23157d18fe16288e8bb0 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 20 Jul 2017 08:41:23 -0700 Subject: [PATCH 30/36] Fix the threaded shift function --- fbpic/boundaries/moving_window.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fbpic/boundaries/moving_window.py b/fbpic/boundaries/moving_window.py index 8080ca769..f08775b79 100644 --- a/fbpic/boundaries/moving_window.py +++ b/fbpic/boundaries/moving_window.py @@ -9,6 +9,7 @@ from scipy.constants import c from fbpic.particles import Particles from fbpic.lpa_utils.boosted_frame import BoostConverter +from fbpic.threading_utils import njit_parallel, prange # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: @@ -484,17 +485,17 @@ def shift_spect_array_cpu( field_array, shift_factor, n_move ): n_move: int The number of 
cells by which the grid should be shifted """ - # Get a 2D CUDA grid - iz, ir = cuda.grid(2) + Nz, Nr = field_array.shape - # Only access values that are actually in the array - if ir < field_array.shape[1] and iz < field_array.shape[0]: - # Calculate the shift factor (raising to the power n_move) + # Loop over the 2D array (in parallel over z if threading is enabled) + for iz in prange( Nz ): power_shift = shift_factor[iz] + # Calculate the shift factor (raising to the power n_move) for i in range(1,n_move): power_shift *= shift_factor[iz] # Shift fields backwards - field_array[iz, ir] *= power_shift + for ir in range( Nr ): + field_array[iz, ir] *= power_shift if cuda_installed: From 11ce49d1fd8bf06459977f30346a6f960167574c Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 20 Jul 2017 09:51:50 -0700 Subject: [PATCH 31/36] Remove arguments nthreads --- fbpic/main.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/fbpic/main.py b/fbpic/main.py index 2266d14ca..647d9c287 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -45,8 +45,7 @@ def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax, p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0., n_order=-1, dens_func=None, filter_currents=True, v_comoving=None, use_galilean=False, initialize_ions=False, - use_cuda=False, nthreads=None, - n_guard=None, n_damp=30, exchange_period=None, + use_cuda=False, n_guard=None, n_damp=30, exchange_period=None, boundaries='periodic', gamma_boost=None, use_all_mpi_ranks=True, particle_shape='linear' ): """ @@ -134,10 +133,6 @@ def dens_func( z, r ) ... use_cuda: bool, optional Wether to use CUDA (GPU) acceleration - nthreads: int, optional - Number of CPU multi-threading threads used (if threading is - enabled) If nthreads is set to None, the number of threads - is automatically determined. n_guard: int, optional Number of guard cells to use at the left and right of @@ -200,15 +195,7 @@ def dens_func( z, r ) ... 
self.use_cuda = False # CPU multi-threading self.use_threading = threading_enabled - if self.use_threading: - # Define number of threads used - if nthreads is not None: - # Automatically take numba preset for number of threads - self.nthreads = nthreads - numba.config.NUMBA_NUM_THREADS = self.nthreads - else: - # Set user-defined number of threads - self.nthreads = numba.config.NUMBA_NUM_THREADS + # Register the comoving parameters self.v_comoving = v_comoving self.use_galilean = use_galilean From 77be6cf95eca587d3593114e9721b6aa5862bc81 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 20 Jul 2017 14:16:41 -0700 Subject: [PATCH 32/36] Modified import structure of the prange function --- fbpic/threading_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py index 4a1200605..a35d9a8c5 100644 --- a/fbpic/threading_utils.py +++ b/fbpic/threading_utils.py @@ -22,7 +22,7 @@ if threading_enabled: try: # Try to import the threading function prange - import numba.prange + from numba import prange as numba_prange except ImportError: threading_enabled = False print('*** Threading not available for the simulation.') @@ -36,4 +36,4 @@ else: # Use the parallel compilation function njit_parallel = njit( parallel=True ) - prange = numba.prange + prange = numba_prange From 777281990f318a86e49b9273b34d20b6a4f7c2a0 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Fri, 21 Jul 2017 21:09:54 -0700 Subject: [PATCH 33/36] Replace line endings to unix style --- .../particles/deposition/threading_methods.py | 3086 ++++++++--------- .../particles/gathering/threading_methods.py | 1202 +++---- 2 files changed, 2144 insertions(+), 2144 deletions(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index 11ecb54c8..3aa895087 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -1,1543 +1,1543 @@ 
-# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the deposition methods for rho and J for linear and cubic -order shapes on the CPU with threading. -""" -import numba -from numba import prange, int64 -import math -from scipy.constants import c - -# ------------------------------- -# Particle shape Factor functions -# ------------------------------- - -# Linear shapes -@numba.njit -def z_shape_linear(cell_position, index): - iz = int64(math.floor(cell_position)) - if index == 0: - return iz+1.-cell_position - if index == 1: - return cell_position - iz - -@numba.njit -def r_shape_linear(cell_position, index): - flip_factor = 1. - ir = int64(math.floor(cell_position)) - if index == 0: - if ir < 0: - flip_factor = -1. - return flip_factor*(ir+1.-cell_position) - if index == 1: - return flip_factor*(cell_position - ir) - -# Cubic shapes -@numba.njit -def z_shape_cubic(cell_position, index): - iz = int64(math.floor(cell_position)) - 1 - if index == 0: - return (-1./6.)*((cell_position-iz)-2)**3 - if index == 1: - return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) - if index == 2: - return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) - if index == 3: - return (-1./6.)*(((iz+3)-cell_position)-2)**3 - -@numba.njit -def r_shape_cubic(cell_position, index): - flip_factor = 1. - ir = int64(math.floor(cell_position)) - 1 - if index == 0: - if ir < 0: - flip_factor = -1. - return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 - if index == 1: - if ir+1 < 0: - flip_factor = -1. - return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) - if index == 2: - if ir+2 < 0: - flip_factor = -1. - return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) - if index == 3: - if ir+3 < 0: - flip_factor = -1. 
- return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 - -# ------------------------------- -# Field deposition - linear - rho -# ------------------------------- - -@numba.njit(parallel=True) -def deposit_rho_prange_linear(x, y, z, w, - invdz, zmin, Nz, - invdr, rmin, Nr, - rho_m0_global, rho_m1_global, - nthreads, tx_chunks): - """ - Deposition of the charge density rho using numba prange on the CPU. - Iterates over the threads in parallel, while each thread iterates - over a batch of particles. Intermediate results for each threads are - stored in copies of the global grid. At the end of the parallel loop, - the thread-local field arrays are combined (summed) to a global array. - (This final reduction is *not* done in this function) - - Calculates the weighted amount of rho that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The rest of the execution is similar to the CUDA equivalent function. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr) - The global helper arrays to store the thread local charge densities - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - nthreads : int - Number of CPU threads used with numba prange - - tx_chunks : list of int - Exact particle batch size per thread. - Last batch size can be greater if Ntot is not a multiple of nthreads. 
- """ - # Deposit the field per cell in parallel (for threads < number of cells) - for tx in prange( nthreads ): - # Loop over all particles in thread chunk - for idx in range( tx_chunks[tx] ): - # Calculate thread local particle index - ptcl_idx = idx + tx*tx_chunks[0] - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0_scal = wj * exptheta_m0 - # Mode 1 - R_m1_scal = wj * exptheta_m1 - - # Original index of the uppper and lower cell - ir_cell = int(math.floor( r_cell )) - iz_cell = int(math.floor( z_cell )) - - # Treat the boundary conditions - # guard cells in lower r - if ir_cell < 0: - ir_cell = 0 - # absorbing in upper r - if ir_cell > Nr-1: - ir_cell = Nr-1 - # periodic boundaries in z - if iz_cell < 0: - iz_cell += Nz - if iz_cell > Nz-1: - iz_cell -= Nz - - # Boundary Region Shifts - ir_flip = int( math.floor(r_cell) ) - - # Declare local field array - R_m0_00 = 0. - R_m0_01 = 0. - R_m0_10 = 0. - R_m0_11 = 0. - - R_m1_00 = 0. + 0.j - R_m1_01 = 0. + 0.j - R_m1_10 = 0. + 0.j - R_m1_11 = 0. 
+ 0.j - - R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal - - if ir_flip == -1: - R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal - else: - R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal - - # Cell shifts for the simulation boundaries - shift_r = 0 - shift_z = 0 - if ir_cell+1 > (Nr-1): - shift_r = -1 - if iz_cell+1 > Nz-1: - shift_z -= Nz - - # Write ptcl fields to thread-local part of global deposition array - rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00 - rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00 - - rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m0_01 - rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m1_01 - - rho_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m0_10 - rho_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m1_10 - - rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11 - rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11 - - return - -# ------------------------------- -# Field deposition - linear - J -# ------------------------------- - -@numba.njit(parallel=True) -def deposit_J_prange_linear(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - j_r_m0_global, j_r_m1_global, - j_t_m0_global, j_t_m1_global, - 
j_z_m0_global, j_z_m1_global, - nthreads, tx_chunks): - """ - Deposition of the current density J using numba prange on the CPU. - Iterates over the threads in parallel, while each thread iterates - over a batch of particles. Intermediate results for each threads are - stored in copies of the global grid. At the end of the parallel loop, - the thread-local field arrays are combined (summed) to the global array. - (This final reduction is *not* done in this function) - - Calculates the weighted amount of J that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The rest of the execution is similar to the CUDA equivalent function. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) - The global helper arrays to store the thread local current component - in each direction (r, t, z) on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - nthreads : int - Number of CPU threads used with numba prange - - tx_chunks : list of int - Exact particle batch size per thread. - Last batch size can be greater if Ntot is not a multiple of nthreads. 
- """ - # Deposit the field per cell in parallel (for threads < number of cells) - for tx in prange( nthreads ): - # Loop over all particles in thread chunk - for idx in range( tx_chunks[tx] ): - # Calculate thread local particle index - ptcl_idx = idx + tx*tx_chunks[0] - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 - J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 - J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 - # Mode 1 - J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 - J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 - J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 - - # Original index of the uppper and lower cell - ir_cell = int(math.floor( r_cell )) - iz_cell = int(math.floor( z_cell )) - - # Treat the boundary conditions - # guard cells in lower r - if ir_cell < 0: - ir_cell = 0 - # absorbing in upper r - if ir_cell > Nr-1: - ir_cell = Nr-1 - # periodic boundaries in z - if iz_cell < 0: - iz_cell += Nz - if iz_cell > Nz-1: - iz_cell -= Nz - - # Boundary Region Shifts - ir_flip = int( math.floor(r_cell) ) - - # 
Declare local field arrays - J_r_m0_00 = 0. - J_r_m1_00 = 0. + 0.j - J_t_m0_00 = 0. - J_t_m1_00 = 0. + 0.j - J_z_m0_00 = 0. - J_z_m1_00 = 0. + 0.j - - J_r_m0_01 = 0. - J_r_m1_01 = 0. + 0.j - J_t_m0_01 = 0. - J_t_m1_01 = 0. + 0.j - J_z_m0_01 = 0. - J_z_m1_01 = 0. + 0.j - - J_r_m0_10 = 0. - J_r_m1_10 = 0. + 0.j - J_t_m0_10 = 0. - J_t_m1_10 = 0. + 0.j - J_z_m0_10 = 0. - J_z_m1_10 = 0. + 0.j - - J_r_m0_11 = 0. - J_r_m1_11 = 0. + 0.j - J_t_m0_11 = 0. - J_t_m1_11 = 0. + 0.j - J_z_m0_11 = 0. - J_z_m1_11 = 0. + 0.j - - J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal - - # Take into account lower r flips - if ir_flip == -1: - J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 += r_shape_linear(r_cell, 
1)*z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal - else: - J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal - - # Cell shifts for the simulation boundaries - shift_r = 0 - shift_z = 0 - if (ir_cell+1) > (Nr-1): - shift_r = -1 - if (iz_cell+1) > Nz-1: - shift_z -= Nz - - # Write ptcl fields to thread-local part of global deposition array - j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00 - j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00 - - j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01 - j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01 - - j_r_m0_global[tx,iz_cell, 
ir_cell+1 + shift_r] += J_r_m0_10 - j_r_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10 - - j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11 - j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11 - - j_t_m0_global[tx,iz_cell, ir_cell] += J_t_m0_00 - j_t_m1_global[tx,iz_cell, ir_cell] += J_t_m1_00 - - j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01 - j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01 - - j_t_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10 - j_t_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10 - - j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11 - j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11 - - j_z_m0_global[tx,iz_cell, ir_cell] += J_z_m0_00 - j_z_m1_global[tx,iz_cell, ir_cell] += J_z_m1_00 - - j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01 - j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01 - - j_z_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10 - j_z_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10 - - j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11 - j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11 - - return - - -# ------------------------------- -# Field deposition - cubic - rho -# ------------------------------- - -@numba.njit(parallel=True) -def deposit_rho_prange_cubic(x, y, z, w, - invdz, zmin, Nz, - invdr, rmin, Nr, - rho_m0_global, rho_m1_global, - nthreads, tx_chunks): - """ - - Deposition of the charge density rho using numba prange on the CPU. - Iterates over the threads in parallel, while each thread iterates - over a batch of particles. Intermediate results for each thread are - stored in copies of the global grid. At the end of the parallel loop, - the thread-local field arrays are combined (summed) to the global array.
- (This final reduction is *not* done in this function) - - Calculates the weighted amount of rho that is deposited to the - 16 cells surrounding the particle based on its shape (cubic). - - The rest of the execution is similar to the CUDA equivalent function. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - rho_m0_global, rho_m1_global : 3darrays of complex numbers (nthreads, Nz, Nr) - The global helper arrays to store the thread local charge densities - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - nthreads : int - Number of CPU threads used with numba prange - - tx_chunks : list of int - Exact particle batch size per thread. - Last batch size can be greater if Ntot is not a multiple of nthreads. - """ - # Loop over the threads in parallel; each thread deposits its own chunk of particles - for tx in prange( nthreads ): - # Loop over all particles in thread chunk - for idx in range( tx_chunks[tx] ): - # Global particle index: chunk tx starts at offset tx*tx_chunks[0] - ptcl_idx = idx + tx*tx_chunks[0] - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1.
- exptheta_m1 = cos + 1.j*sin - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0_scal = wj * exptheta_m0 - # Mode 1 - R_m1_scal = wj * exptheta_m1 - - # Original index of the upper and lower cell - ir_cell = int(math.floor( r_cell )) - iz_cell = int(math.floor( z_cell )) - - # Treat the boundary conditions - # guard cells in lower r - if ir_cell < 0: - ir_cell = 0 - # absorbing in upper r - if ir_cell > Nr-1: - ir_cell = Nr-1 - # periodic boundaries in z - if iz_cell < 0: - iz_cell += Nz - if iz_cell > Nz-1: - iz_cell -= Nz - - # Compute values in local copies and consider boundaries - ir_flip = int( math.floor(r_cell) ) - 1 - - # Declare the local field value for - # all possible deposition directions, - # depending on the shape order and per mode. - # (R_m<mode>_<i><j>: i is the local slot along r, j the slot along z) - R_m0_00 = 0. - R_m1_00 = 0. + 0.j - - R_m0_01 = 0. - R_m1_01 = 0. + 0.j - - R_m0_02 = 0. - R_m1_02 = 0. + 0.j - - R_m0_03 = 0. - R_m1_03 = 0. + 0.j - - R_m0_10 = 0. - R_m1_10 = 0. + 0.j - - R_m0_11 = 0. - R_m1_11 = 0. + 0.j - - R_m0_12 = 0. - R_m1_12 = 0. + 0.j - - R_m0_13 = 0. - R_m1_13 = 0. + 0.j - - R_m0_20 = 0. - R_m1_20 = 0. + 0.j - - R_m0_21 = 0. - R_m1_21 = 0. + 0.j - - R_m0_22 = 0. - R_m1_22 = 0. + 0.j - - R_m0_23 = 0. - R_m1_23 = 0. + 0.j - - R_m0_30 = 0. - R_m1_30 = 0. + 0.j - - R_m0_31 = 0. - R_m1_31 = 0. + 0.j - - R_m0_32 = 0. - R_m1_32 = 0. + 0.j - - R_m0_33 = 0. - R_m1_33 = 0.
+ 0.j - - # ir_flip == -2: r-shape indices 0,1,2,3 are accumulated in r slots 2,1,1,2 - if (ir_flip == -2): - R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell,
1)*R_m0_scal - R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal - - # ir_flip == -1: r-shape indices 0,1,2,3 are accumulated in r slots 1,1,2,3 - if (ir_flip == -1): - R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell,
2)*R_m1_scal - R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal - # ir_flip >= 0: r-shape index i is accumulated in r slot i - if (ir_flip >= 0): - R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell,
0)*R_m0_scal - R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal - - R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal - R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal - R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal - R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal - R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal - R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal - R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal - R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal - - # Index Shifting since local copies are centered around - # the current cell - srl = 0 # shift r lower - sru = 0 # shift r upper inner - sru2 = 0 # shift r upper outer - szl = 0 # shift z lower - szu = 0 # shift z upper inner - szu2 = 0 # shift z upper outer - if (iz_cell-1) < 0: - szl += Nz - if (iz_cell) == (Nz - 1): - szu -= Nz - szu2 -= Nz - if (iz_cell+1) == (Nz - 1): - szu2 -= Nz - if (ir_cell) >= (Nr - 1): - sru = -1 - sru2 = -2 - if (ir_cell+1) == (Nr - 1): - sru2 = -1 - if (ir_cell-1) < 0: - srl = 1 - - # Write ptcl fields to thread-local part of global deposition array - rho_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00 - rho_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00 - rho_m0_global[tx, iz_cell, ir_cell - 1 + srl] += R_m0_01 - rho_m1_global[tx, iz_cell, ir_cell - 1 + srl] += R_m1_01 - rho_m0_global[tx, iz_cell + 1 + szu,
ir_cell - 1 + srl] += R_m0_02 - rho_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02 - rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03 - rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03 - rho_m0_global[tx, iz_cell - 1 + szl, ir_cell] += R_m0_10 - rho_m1_global[tx, iz_cell - 1 + szl, ir_cell] += R_m1_10 - rho_m0_global[tx, iz_cell, ir_cell] += R_m0_11 - rho_m1_global[tx, iz_cell, ir_cell] += R_m1_11 - rho_m0_global[tx, iz_cell + 1 + szu, ir_cell] += R_m0_12 - rho_m1_global[tx, iz_cell + 1 + szu, ir_cell] += R_m1_12 - rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m0_13 - rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m1_13 - rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20 - rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20 - rho_m0_global[tx, iz_cell, ir_cell + 1 + sru] += R_m0_21 - rho_m1_global[tx, iz_cell, ir_cell + 1 + sru] += R_m1_21 - rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22 - rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22 - rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23 - rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23 - rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30 - rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30 - rho_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m0_31 - rho_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m1_31 - rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32 - rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32 - rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33 - rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33 - - return - -# ------------------------------- -# Field deposition - cubic - J -# ------------------------------- - -@numba.njit(parallel=True) -def deposit_J_prange_cubic(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, -
invdr, rmin, Nr, - j_r_m0_global, j_r_m1_global, - j_t_m0_global, j_t_m1_global, - j_z_m0_global, j_z_m1_global, - nthreads, tx_chunks): - """ - Deposition of the current density J using numba prange on the CPU. - Iterates over the threads in parallel, while each thread iterates - over a batch of particles. Intermediate results for each threads are - stored in copies of the global grid. At the end of the parallel loop, - the thread-local field arrays are combined (summed) to the global array. - (This final reduction is *not* done in this function) - - Calculates the weighted amount of J that is deposited to the - 16 cells surounding the particle based on its shape (cubic). - - The rest of the execution is similar to the CUDA equivalent function. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) - The global helper arrays to store the thread local current component - in each direction (r, t, z) on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - nthreads : int - Number of CPU threads used with numba prange - - tx_chunks : list of int - Exact particle batch size per thread. - Last batch size can be greater if Ntot is not a multiple of nthreads. 
- """ - # Deposit the field per cell in parallel (for threads < number of cells) - for tx in prange( nthreads ): - # Loop over all particles in thread chunk - for idx in range( tx_chunks[tx] ): - # Calculate thread local particle index - ptcl_idx = idx + tx*tx_chunks[0] - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 - J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 - J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 - # Mode 1 - J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 - J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 - J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 - - # Original index of the uppper and lower cell - ir_cell = int(math.floor( r_cell )) - iz_cell = int(math.floor( z_cell )) - - # Treat the boundary conditions - # guard cells in lower r - if ir_cell < 0: - ir_cell = 0 - # absorbing in upper r - if ir_cell > Nr-1: - ir_cell = Nr-1 - # periodic boundaries in z - if iz_cell < 0: - iz_cell += Nz - if iz_cell > Nz-1: - iz_cell -= Nz - - # Compute values in local copies and consider boundaries - ir_flip = 
int64(math.floor(r_cell)) - 1 - - # Declare the local field value for - # all possible deposition directions, - # depending on the shape order and per mode for r,t and z. - J_r_m0_00 = 0. - J_t_m0_00 = 0. - J_z_m0_00 = 0. - J_r_m1_00 = 0. + 0.j - J_t_m1_00 = 0. + 0.j - J_z_m1_00 = 0. + 0.j - - J_r_m0_01 = 0. - J_t_m0_01 = 0. - J_z_m0_01 = 0. - J_r_m1_01 = 0. + 0.j - J_t_m1_01 = 0. + 0.j - J_z_m1_01 = 0. + 0.j - - J_r_m0_02 = 0. - J_t_m0_02 = 0. - J_z_m0_02 = 0. - J_r_m1_02 = 0. + 0.j - J_t_m1_02 = 0. + 0.j - J_z_m1_02 = 0. + 0.j - - J_r_m0_03 = 0. - J_t_m0_03 = 0. - J_z_m0_03 = 0. - J_r_m1_03 = 0. + 0.j - J_t_m1_03 = 0. + 0.j - J_z_m1_03 = 0. + 0.j - - J_r_m0_10 = 0. - J_t_m0_10 = 0. - J_z_m0_10 = 0. - J_r_m1_10 = 0. + 0.j - J_t_m1_10 = 0. + 0.j - J_z_m1_10 = 0. + 0.j - - J_r_m0_11 = 0. - J_t_m0_11 = 0. - J_z_m0_11 = 0. - J_r_m1_11 = 0. + 0.j - J_t_m1_11 = 0. + 0.j - J_z_m1_11 = 0. + 0.j - - J_r_m0_12 = 0. - J_t_m0_12 = 0. - J_z_m0_12 = 0. - J_r_m1_12 = 0. + 0.j - J_t_m1_12 = 0. + 0.j - J_z_m1_12 = 0. + 0.j - - J_r_m0_13 = 0. - J_t_m0_13 = 0. - J_z_m0_13 = 0. - J_r_m1_13 = 0. + 0.j - J_t_m1_13 = 0. + 0.j - J_z_m1_13 = 0. + 0.j - - J_r_m0_20 = 0. - J_t_m0_20 = 0. - J_z_m0_20 = 0. - J_r_m1_20 = 0. + 0.j - J_t_m1_20 = 0. + 0.j - J_z_m1_20 = 0. + 0.j - - J_r_m0_21 = 0. - J_t_m0_21 = 0. - J_z_m0_21 = 0. - J_r_m1_21 = 0. + 0.j - J_t_m1_21 = 0. + 0.j - J_z_m1_21 = 0. + 0.j - - J_r_m0_22 = 0. - J_t_m0_22 = 0. - J_z_m0_22 = 0. - J_r_m1_22 = 0. + 0.j - J_t_m1_22 = 0. + 0.j - J_z_m1_22 = 0. + 0.j - - J_r_m0_23 = 0. - J_t_m0_23 = 0. - J_z_m0_23 = 0. - J_r_m1_23 = 0. + 0.j - J_t_m1_23 = 0. + 0.j - J_z_m1_23 = 0. + 0.j - - J_r_m0_30 = 0. - J_t_m0_30 = 0. - J_z_m0_30 = 0. - J_r_m1_30 = 0. + 0.j - J_t_m1_30 = 0. + 0.j - J_z_m1_30 = 0. + 0.j - - J_r_m0_31 = 0. - J_t_m0_31 = 0. - J_z_m0_31 = 0. - J_r_m1_31 = 0. + 0.j - J_t_m1_31 = 0. + 0.j - J_z_m1_31 = 0. + 0.j - - J_r_m0_32 = 0. - J_t_m0_32 = 0. - J_z_m0_32 = 0. - J_r_m1_32 = 0. + 0.j - J_t_m1_32 = 0. + 0.j - J_z_m1_32 = 0. 
+ 0.j - - J_r_m0_33 = 0. - J_t_m0_33 = 0. - J_z_m0_33 = 0. - J_r_m1_33 = 0. + 0.j - J_t_m1_33 = 0. + 0.j - J_z_m1_33 = 0. + 0.j - - if (ir_flip == -2): - J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
3)*J_r_m1_scal - - J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - - J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape_cubic(r_cell, 
2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += 
r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - if (ir_flip == -1): - J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 
2)*J_r_m1_scal - J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_33 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_30 += 
r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - 
J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - if (ir_flip >= 0): - J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape_cubic(r_cell, 
1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal - J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal - J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal - J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal - J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal - J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal - J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal - J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal - - J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_03 += 
r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal - J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal - J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal - J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal - J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal - J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal - J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal - J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal - - 
J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal - J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 
0)*J_z_m1_scal - J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal - J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal - J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal - J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal - J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal - J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal - - # Index Shifting since local copies are centered around - # the current cell - srl = 0 # shift r lower - sru = 0 # shift r upper inner - sru2 = 0 # shift r upper outer - szl = 0 # shift z lower - szu = 0 # shift z upper inner - szu2 = 0 # shift z upper outer - if (iz_cell-1) < 0: - szl += Nz - if (iz_cell) == (Nz - 1): - szu -= Nz - szu2 -= Nz - if (iz_cell+1) == (Nz - 1): - szu2 -= Nz - if (ir_cell) >= (Nr - 1): - sru = -1 - sru2 = -2 - if (ir_cell+1) == (Nr - 1): - sru2 = -1 - if (ir_cell-1) < 0: - srl = 1 - - j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 - j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 - j_r_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m0_01 - j_r_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m1_01 - j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 - j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 - j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 - j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03 - j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 - j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 - j_r_m0_global[tx, iz_cell, ir_cell] += J_r_m0_11 - j_r_m1_global[tx, iz_cell, ir_cell] += J_r_m1_11 - j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m0_12 - j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m1_12 - j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 - j_r_m1_global[tx, iz_cell + 2 + szu2, 
ir_cell] += J_r_m1_13 - j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20 - j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 - j_r_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m0_21 - j_r_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m1_21 - j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 - j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 - j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 - j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 - j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 - j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 - j_r_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 - j_r_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 - j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 - j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 - j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 - j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 - - j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 - j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 - j_t_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m0_01 - j_t_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m1_01 - j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 - j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02 - j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 - j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 - j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 - j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 - j_t_m0_global[tx, iz_cell, ir_cell] += J_t_m0_11 - j_t_m1_global[tx, iz_cell, ir_cell] += J_t_m1_11 - j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m0_12 - j_t_m1_global[tx, iz_cell + 
1 + szu, ir_cell] += J_t_m1_12 - j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 - j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 - j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20 - j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 - j_t_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m0_21 - j_t_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m1_21 - j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 - j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 - j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 - j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 - j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 - j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30 - j_t_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 - j_t_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 - j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 - j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32 - j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 - j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 - - j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 - j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 - j_z_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m0_01 - j_z_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m1_01 - j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 - j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 - j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 - j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 - j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 - j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 - j_z_m0_global[tx, iz_cell, ir_cell] += J_z_m0_11 - 
j_z_m1_global[tx, iz_cell, ir_cell] += J_z_m1_11 - j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m0_12 - j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m1_12 - j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13 - j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13 - j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20 - j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20 - j_z_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m0_21 - j_z_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m1_21 - j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22 - j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22 - j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23 - j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23 - j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30 - j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30 - j_z_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31 - j_z_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31 - j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32 - j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32 - j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33 - j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33 - - return - -# ----------------------------------------------------------------------- -# Parallel reduction of the global arrays for threads into a single array -# ----------------------------------------------------------------------- - -@numba.njit( parallel=True ) -def sum_reduce_2d_array( global_array, reduced_array ): - """ - Sum the array `global_array` along its first axis and - add it into `reduced_array`. 
# Copyright 2016, FBPIC contributors
# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
# License: 3-Clause-BSD-LBNL
"""
This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
It defines the deposition methods for rho and J for linear and cubic
order shapes on the CPU with threading.
"""
import numba
from numba import prange, int64
import math
from scipy.constants import c

# -------------------------------
# Particle shape Factor functions
# -------------------------------

# Linear shapes
@numba.njit
def z_shape_linear(cell_position, index):
    """
    Return the linear shape factor along z for one of the two
    neighbouring cells.

    Parameters
    ----------
    cell_position : float
        Position of the particle along z, in cell units

    index : int
        0 selects the lower cell (floor(cell_position)),
        1 selects the upper cell (floor(cell_position) + 1).
        Any other value falls through and returns None.
    """
    iz = int64(math.floor(cell_position))
    if index == 0:
        return iz+1.-cell_position
    if index == 1:
        return cell_position - iz

@numba.njit
def r_shape_linear(cell_position, index):
    """
    Return the linear shape factor along r for one of the two
    neighbouring cells.

    The sign of the lower-cell weight (index 0) is flipped when the
    lower cell index floor(cell_position) is negative. The upper-cell
    weight (index 1) is never flipped.

    Parameters
    ----------
    cell_position : float
        Position of the particle along r, in cell units

    index : int
        0 selects the lower cell, 1 selects the upper cell.
        Any other value falls through and returns None.
    """
    flip_factor = 1.
    ir = int64(math.floor(cell_position))
    if index == 0:
        if ir < 0:
            flip_factor = -1.
        return flip_factor*(ir+1.-cell_position)
    if index == 1:
        return flip_factor*(cell_position - ir)

# Cubic shapes
@numba.njit
def z_shape_cubic(cell_position, index):
    """
    Return the cubic shape factor along z for one of the four
    neighbouring cells.

    Parameters
    ----------
    cell_position : float
        Position of the particle along z, in cell units

    index : int
        0, 1, 2 or 3: selects the cell iz + index, where
        iz = floor(cell_position) - 1.
        Any other value falls through and returns None.
    """
    iz = int64(math.floor(cell_position)) - 1
    if index == 0:
        return (-1./6.)*((cell_position-iz)-2)**3
    if index == 1:
        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)
    if index == 2:
        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)
    if index == 3:
        return (-1./6.)*(((iz+3)-cell_position)-2)**3

@numba.njit
def r_shape_cubic(cell_position, index):
    """
    Return the cubic shape factor along r for one of the four
    neighbouring cells.

    The sign of the returned weight is flipped whenever the selected
    cell index ir + index is negative, where
    ir = floor(cell_position) - 1.

    Parameters
    ----------
    cell_position : float
        Position of the particle along r, in cell units

    index : int
        0, 1, 2 or 3: selects the cell ir + index.
        Any other value falls through and returns None.
    """
    flip_factor = 1.
    ir = int64(math.floor(cell_position)) - 1
    if index == 0:
        if ir < 0:
            flip_factor = -1.
        return flip_factor*(-1./6.)*((cell_position-ir)-2)**3
    if index == 1:
        if ir+1 < 0:
            flip_factor = -1.
        return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4)
    if index == 2:
        if ir+2 < 0:
            flip_factor = -1.
        return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4)
    if index == 3:
        if ir+3 < 0:
            flip_factor = -1.
        return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3

# -------------------------------
# Field deposition - linear - rho
# -------------------------------

@numba.njit(parallel=True)
def deposit_rho_prange_linear(x, y, z, w,
                           invdz, zmin, Nz,
                           invdr, rmin, Nr,
                           rho_m0_global, rho_m1_global,
                           nthreads, tx_chunks):
    """
    Deposition of the charge density rho using numba prange on the CPU.
    Iterates over the threads in parallel, while each thread iterates
    over a batch of particles. Intermediate results for each thread are
    stored in copies of the global grid. At the end of the parallel loop,
    the thread-local field arrays are combined (summed) to a global array.
    (This final reduction is *not* done in this function)

    Calculates the weighted amount of rho that is deposited to the
    4 cells surrounding the particle based on its shape (linear).

    The rest of the execution is similar to the CUDA equivalent function.

    Parameters
    ----------
    x, y, z : 1darray of floats (in meters)
        The position of the particles

    w : 1d array of floats
        The weights of the particles

    rho_m0_global, rho_m1_global : 3darrays of complex (nthread, Nz, Nr)
        The global helper arrays to store the thread local charge densities
        on the interpolation grid for mode 0 and 1.
        (is modified by this function)

    invdz, invdr : float (in meters^-1)
        Inverse of the grid step along the considered direction

    zmin, rmin : float (in meters)
        Position of the edge of the simulation box,
        along the considered direction

    Nz, Nr : int
        Number of gridpoints along the considered direction

    nthreads : int
        Number of CPU threads used with numba prange

    tx_chunks : list of int
        Exact particle batch size per thread.
        Last batch size can be greater if Ntot is not a multiple of nthreads.
        The first particle of thread `tx` is taken at index
        tx*tx_chunks[0], i.e. every batch except the last one is
        expected to have the size tx_chunks[0].
    """
    # Parallel loop over the threads: each thread only writes to its
    # own slice [tx, :, :] of the global helper arrays
    for tx in prange( nthreads ):
        # Index of the first particle handled by this thread
        first_ptcl = tx*tx_chunks[0]
        # Loop over all particles in thread chunk
        for idx in range( tx_chunks[tx] ):
            # Calculate thread local particle index
            ptcl_idx = idx + first_ptcl
            # Preliminary arrays for the cylindrical conversion
            # --------------------------------------------
            # Position
            xj = x[ptcl_idx]
            yj = y[ptcl_idx]
            zj = z[ptcl_idx]
            # Weights
            wj = w[ptcl_idx]

            # Cylindrical conversion
            rj = math.sqrt(xj**2 + yj**2)
            # Avoid division by 0.
            if (rj != 0.):
                invr = 1./rj
                cos = xj*invr  # Cosine
                sin = yj*invr  # Sine
            else:
                cos = 1.
                sin = 0.
            exptheta_m0 = 1.
            exptheta_m1 = cos + 1.j*sin

            # Positions of the particles, in the cell unit
            r_cell = invdr*(rj - rmin) - 0.5
            z_cell = invdz*(zj - zmin) - 0.5

            # Calculate rho
            # --------------------------------------------
            # Mode 0
            R_m0_scal = wj * exptheta_m0
            # Mode 1
            R_m1_scal = wj * exptheta_m1

            # Original index of the upper and lower cell
            # (ir_flip keeps the unclamped radial index, which is
            # needed below to detect particles below the first cell)
            ir_flip = int(math.floor( r_cell ))
            ir_cell = ir_flip
            iz_cell = int(math.floor( z_cell ))

            # Treat the boundary conditions
            # guard cells in lower r
            if ir_cell < 0:
                ir_cell = 0
            # absorbing in upper r
            if ir_cell > Nr-1:
                ir_cell = Nr-1
            # periodic boundaries in z
            if iz_cell < 0:
                iz_cell += Nz
            if iz_cell > Nz-1:
                iz_cell -= Nz

            # Shape factor products for the 4 neighbouring cells
            # (first digit: r offset, second digit: z offset).
            # Evaluated once per particle and reused for both modes.
            S_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0)
            S_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1)
            S_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0)
            S_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1)

            # Contribution of the lower r cell
            R_m0_00 = S_00 * R_m0_scal
            R_m0_01 = S_01 * R_m0_scal
            R_m1_00 = S_00 * R_m1_scal
            R_m1_01 = S_01 * R_m1_scal

            if ir_flip == -1:
                # The particle lies below the first radial cell: ir_cell
                # was clamped to 0, so the upper-cell weight is deposited
                # in the same cell as the (sign-flipped) lower-cell weight
                R_m0_00 += S_10 * R_m0_scal
                R_m0_01 += S_11 * R_m0_scal
                R_m1_00 += S_10 * R_m1_scal
                R_m1_01 += S_11 * R_m1_scal
                R_m0_10 = 0.
                R_m0_11 = 0.
                R_m1_10 = 0. + 0.j
                R_m1_11 = 0. + 0.j
            else:
                # Contribution of the upper r cell
                R_m0_10 = S_10 * R_m0_scal
                R_m0_11 = S_11 * R_m0_scal
                R_m1_10 = S_10 * R_m1_scal
                R_m1_11 = S_11 * R_m1_scal

            # Cell shifts for the simulation boundaries
            shift_r = 0
            shift_z = 0
            # Upper r: deposit in the last cell instead of outside the grid
            if ir_cell+1 > (Nr-1):
                shift_r = -1
            # Upper z: wrap around (periodic)
            if iz_cell+1 > Nz-1:
                shift_z -= Nz

            # Write ptcl fields to thread-local part of global deposition array
            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00
            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00

            rho_m0_global[tx, iz_cell+1 + shift_z, ir_cell] += R_m0_01
            rho_m1_global[tx, iz_cell+1 + shift_z, ir_cell] += R_m1_01

            rho_m0_global[tx, iz_cell, ir_cell+1 + shift_r] += R_m0_10
            rho_m1_global[tx, iz_cell, ir_cell+1 + shift_r] += R_m1_10

            rho_m0_global[tx, iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
            rho_m1_global[tx, iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11

    return
+ + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local current component + in each direction (r, t, z) on the interpolation grid for mode 0 and 1. + (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + tx_chunks : list of int + Exact particle batch size per thread. + Last batch size can be greater if Ntot is not a multiple of nthreads. + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for tx in prange( nthreads ): + # Loop over all particles in thread chunk + for idx in range( tx_chunks[tx] ): + # Calculate thread local particle index + ptcl_idx = idx + tx*tx_chunks[0] + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Velocity + uxj = ux[ptcl_idx] + uyj = uy[ptcl_idx] + uzj = uz[ptcl_idx] + # Inverse gamma + inv_gammaj = inv_gamma[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. 
+ exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Original index of the uppper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Boundary Region Shifts + ir_flip = int( math.floor(r_cell) ) + + # Declare local field arrays + J_r_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m0_00 = 0. + J_t_m1_00 = 0. + 0.j + J_z_m0_00 = 0. + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m0_01 = 0. + J_t_m1_01 = 0. + 0.j + J_z_m0_01 = 0. + J_z_m1_01 = 0. + 0.j + + J_r_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m0_10 = 0. + J_t_m1_10 = 0. + 0.j + J_z_m0_10 = 0. + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m0_11 = 0. + J_t_m1_11 = 0. + 0.j + J_z_m0_11 = 0. + J_z_m1_11 = 0. 
+ 0.j + + J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Take into account lower r flips + if ir_flip == -1: + J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * 
J_z_m1_scal + else: + J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if (ir_cell+1) > (Nr-1): + shift_r = -1 + if (iz_cell+1) > Nz-1: + shift_z -= Nz + + # Write ptcl fields to thread-local part of global deposition array + j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00 + j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00 + + j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01 + j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01 + + j_r_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10 + j_r_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10 + + j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11 + j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11 + + j_t_m0_global[tx,iz_cell, ir_cell] += J_t_m0_00 + j_t_m1_global[tx,iz_cell, ir_cell] += J_t_m1_00 + + j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01 + j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01 + + j_t_m0_global[tx,iz_cell, ir_cell+1 + 
@numba.njit
def _scatter_cubic(grid, tx, iz_targets, ir_targets, r_slots,
                   sr, sz, amp, zero):
    """
    Add one particle's cubic-shape contribution to `grid[tx]`.

    Private helper of `deposit_rho_prange_cubic`; handles a single
    azimuthal mode.

    Parameters
    ----------
    grid : 3darray (nthreads, Nz, Nr)
        Thread-local deposition array (is modified by this function)

    tx : int
        Index of the thread-local slice to write to

    iz_targets, ir_targets : tuples of 4 ints
        Grid indices of the 4 longitudinal / 4 radial target cells

    r_slots : tuple of 4 ints
        `r_slots[k]` is the position in `ir_targets` that receives the
        k-th radial shape factor. A value outside 0..3 means that this
        shape factor is not deposited.

    sr, sz : tuples of 4 floats
        Radial and longitudinal shape factors

    amp : float or complex
        Quantity carried by the particle for this mode

    zero : float or complex
        Zero of the same type as `amp`; start value of the accumulation
    """
    # Radial slot is the outer loop, longitudinal slot the inner one
    for r_slot in range(4):
        for z_slot in range(4):
            # Sum all radial shape factors that map onto this slot
            # (in increasing order of k) before touching the grid
            acc = zero
            for k in range(4):
                if r_slots[k] == r_slot:
                    acc += sr[k]*sz[z_slot]*amp
            grid[tx, iz_targets[z_slot], ir_targets[r_slot]] += acc


@numba.njit(parallel=True)
def deposit_rho_prange_cubic(x, y, z, w,
                             invdz, zmin, Nz,
                             invdr, rmin, Nr,
                             rho_m0_global, rho_m1_global,
                             nthreads, tx_chunks):
    """
    Deposition of the charge density rho using numba prange on the CPU.
    Iterates over the threads in parallel, while each thread iterates
    over a batch of particles. Each thread writes only to its own slice
    `[tx, :, :]` of the global helper arrays.
    (The final reduction over the thread axis is *not* done in this
    function.)

    Calculates the weighted amount of rho that is deposited to the
    16 cells surrounding the particle based on its shape (cubic).

    Boundary handling, as implemented below:
    - lower r : shape factors that would fall below radial cell 0 are
      folded back onto the cells 0 and 1. A particle with
      floor(r_cell) < -1 does not deposit anything.
    - upper r : the radial indices are clamped to Nr-1
    - z : the longitudinal indices are wrapped once by Nz (periodic)

    Parameters
    ----------
    x, y, z : 1darray of floats (in meters)
        The position of the particles

    w : 1d array of floats
        The weights of the particles

    invdz, invdr : float (in meters^-1)
        Inverse of the grid step along the considered direction

    zmin, rmin : float (in meters)
        Position of the edge of the simulation box,
        along the considered direction

    Nz, Nr : int
        Number of gridpoints along the considered direction

    rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr)
        The global helper arrays to store the thread local charge densities
        on the interpolation grid for mode 0 and 1.
        (is modified by this function)

    nthreads : int
        Number of CPU threads used with numba prange

    tx_chunks : list of int
        Particle batch size per thread. The first particle of thread `tx`
        is taken at `tx*tx_chunks[0]`, so every batch except the last one
        must have the size `tx_chunks[0]`; only the last batch can be
        greater (if Ntot is not a multiple of nthreads).
    """
    # Loop over the particle batches in parallel (one batch per thread)
    for tx in prange( nthreads ):
        # Loop over all particles in thread chunk
        for idx in range( tx_chunks[tx] ):
            # Calculate thread local particle index
            ptcl_idx = idx + tx*tx_chunks[0]
            # Preliminary quantities for the cylindrical conversion
            # --------------------------------------------
            # Position
            xj = x[ptcl_idx]
            yj = y[ptcl_idx]
            zj = z[ptcl_idx]
            # Weights
            wj = w[ptcl_idx]

            # Cylindrical conversion
            rj = math.sqrt(xj**2 + yj**2)
            # Avoid division by 0.
            if (rj != 0.):
                invr = 1./rj
                cos = xj*invr  # Cosine
                sin = yj*invr  # Sine
            else:
                cos = 1.
                sin = 0.
            # Azimuthal factor of mode 1 (the factor of mode 0 is 1)
            exptheta_m1 = cos + 1.j*sin

            # Positions of the particles, in the cell unit
            r_cell = invdr*(rj - rmin) - 0.5
            z_cell = invdz*(zj - zmin) - 0.5

            # Calculate rho
            # --------------------------------------------
            # Mode 0
            R_m0_scal = wj
            # Mode 1
            R_m1_scal = wj * exptheta_m1

            # Index of the cell below the particle
            ir_cell = int(math.floor( r_cell ))
            iz_cell = int(math.floor( z_cell ))
            # Index of the lowest radial cell of the cubic stencil,
            # before any boundary treatment
            ir_flip = ir_cell - 1

            # Treat the boundary conditions
            # guard cells in lower r
            if ir_cell < 0:
                ir_cell = 0
            # absorbing in upper r
            if ir_cell > Nr-1:
                ir_cell = Nr-1
            # periodic boundaries in z
            if iz_cell < 0:
                iz_cell += Nz
            if iz_cell > Nz-1:
                iz_cell -= Nz

            # Target slot (0..3, i.e. ir_cell-1 .. ir_cell+2) of each of
            # the 4 radial shape factors, taking the lower r flips
            # into account
            if ir_flip == -2:
                r_slots = (2, 1, 1, 2)
            elif ir_flip == -1:
                r_slots = (1, 1, 2, 3)
            elif ir_flip >= 0:
                r_slots = (0, 1, 2, 3)
            else:
                # Stencil entirely below the grid: nothing is deposited
                r_slots = (-1, -1, -1, -1)

            # Index shifting that keeps the 4x4 stencil inside the grid
            srl = 0   # shift r lower
            sru = 0   # shift r upper inner
            sru2 = 0  # shift r upper outer
            szl = 0   # shift z lower
            szu = 0   # shift z upper inner
            szu2 = 0  # shift z upper outer
            if (iz_cell-1) < 0:
                szl += Nz
            if (iz_cell) == (Nz - 1):
                szu -= Nz
                szu2 -= Nz
            if (iz_cell+1) == (Nz - 1):
                szu2 -= Nz
            if (ir_cell) >= (Nr - 1):
                sru = -1
                sru2 = -2
            if (ir_cell+1) == (Nr - 1):
                sru2 = -1
            if (ir_cell-1) < 0:
                srl = 1

            iz_targets = (iz_cell - 1 + szl, iz_cell,
                          iz_cell + 1 + szu, iz_cell + 2 + szu2)
            ir_targets = (ir_cell - 1 + srl, ir_cell,
                          ir_cell + 1 + sru, ir_cell + 2 + sru2)

            # Shape factors, evaluated once per particle and shared
            # by both modes
            sr = (r_shape_cubic(r_cell, 0), r_shape_cubic(r_cell, 1),
                  r_shape_cubic(r_cell, 2), r_shape_cubic(r_cell, 3))
            sz = (z_shape_cubic(z_cell, 0), z_shape_cubic(z_cell, 1),
                  z_shape_cubic(z_cell, 2), z_shape_cubic(z_cell, 3))

            # Write ptcl fields to thread-local part of global deposition array
            _scatter_cubic(rho_m0_global, tx, iz_targets, ir_targets,
                           r_slots, sr, sz, R_m0_scal, 0.)
            _scatter_cubic(rho_m1_global, tx, iz_targets, ir_targets,
                           r_slots, sr, sz, R_m1_scal, 0. + 0.j)

    return
invdr, rmin, Nr, + j_r_m0_global, j_r_m1_global, + j_t_m0_global, j_t_m1_global, + j_z_m0_global, j_z_m1_global, + nthreads, tx_chunks): + """ + Deposition of the current density J using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each threads are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of J that is deposited to the + 16 cells surounding the particle based on its shape (cubic). + + The rest of the execution is similar to the CUDA equivalent function. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local current component + in each direction (r, t, z) on the interpolation grid for mode 0 and 1. + (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + tx_chunks : list of int + Exact particle batch size per thread. + Last batch size can be greater if Ntot is not a multiple of nthreads. 
+ """ + # Deposit the field per cell in parallel (for threads < number of cells) + for tx in prange( nthreads ): + # Loop over all particles in thread chunk + for idx in range( tx_chunks[tx] ): + # Calculate thread local particle index + ptcl_idx = idx + tx*tx_chunks[0] + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Velocity + uxj = ux[ptcl_idx] + uyj = uy[ptcl_idx] + uzj = uz[ptcl_idx] + # Inverse gamma + inv_gammaj = inv_gamma[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Original index of the uppper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Compute values in local copies and consider boundaries + ir_flip = 
int64(math.floor(r_cell)) - 1 + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode for r,t and z. + J_r_m0_00 = 0. + J_t_m0_00 = 0. + J_z_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m1_00 = 0. + 0.j + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_t_m0_01 = 0. + J_z_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m1_01 = 0. + 0.j + J_z_m1_01 = 0. + 0.j + + J_r_m0_02 = 0. + J_t_m0_02 = 0. + J_z_m0_02 = 0. + J_r_m1_02 = 0. + 0.j + J_t_m1_02 = 0. + 0.j + J_z_m1_02 = 0. + 0.j + + J_r_m0_03 = 0. + J_t_m0_03 = 0. + J_z_m0_03 = 0. + J_r_m1_03 = 0. + 0.j + J_t_m1_03 = 0. + 0.j + J_z_m1_03 = 0. + 0.j + + J_r_m0_10 = 0. + J_t_m0_10 = 0. + J_z_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m1_10 = 0. + 0.j + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_t_m0_11 = 0. + J_z_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m1_11 = 0. + 0.j + J_z_m1_11 = 0. + 0.j + + J_r_m0_12 = 0. + J_t_m0_12 = 0. + J_z_m0_12 = 0. + J_r_m1_12 = 0. + 0.j + J_t_m1_12 = 0. + 0.j + J_z_m1_12 = 0. + 0.j + + J_r_m0_13 = 0. + J_t_m0_13 = 0. + J_z_m0_13 = 0. + J_r_m1_13 = 0. + 0.j + J_t_m1_13 = 0. + 0.j + J_z_m1_13 = 0. + 0.j + + J_r_m0_20 = 0. + J_t_m0_20 = 0. + J_z_m0_20 = 0. + J_r_m1_20 = 0. + 0.j + J_t_m1_20 = 0. + 0.j + J_z_m1_20 = 0. + 0.j + + J_r_m0_21 = 0. + J_t_m0_21 = 0. + J_z_m0_21 = 0. + J_r_m1_21 = 0. + 0.j + J_t_m1_21 = 0. + 0.j + J_z_m1_21 = 0. + 0.j + + J_r_m0_22 = 0. + J_t_m0_22 = 0. + J_z_m0_22 = 0. + J_r_m1_22 = 0. + 0.j + J_t_m1_22 = 0. + 0.j + J_z_m1_22 = 0. + 0.j + + J_r_m0_23 = 0. + J_t_m0_23 = 0. + J_z_m0_23 = 0. + J_r_m1_23 = 0. + 0.j + J_t_m1_23 = 0. + 0.j + J_z_m1_23 = 0. + 0.j + + J_r_m0_30 = 0. + J_t_m0_30 = 0. + J_z_m0_30 = 0. + J_r_m1_30 = 0. + 0.j + J_t_m1_30 = 0. + 0.j + J_z_m1_30 = 0. + 0.j + + J_r_m0_31 = 0. + J_t_m0_31 = 0. + J_z_m0_31 = 0. + J_r_m1_31 = 0. + 0.j + J_t_m1_31 = 0. + 0.j + J_z_m1_31 = 0. + 0.j + + J_r_m0_32 = 0. + J_t_m0_32 = 0. + J_z_m0_32 = 0. + J_r_m1_32 = 0. + 0.j + J_t_m1_32 = 0. + 0.j + J_z_m1_32 = 0. 
+ 0.j + + J_r_m0_33 = 0. + J_t_m0_33 = 0. + J_z_m0_33 = 0. + J_r_m1_33 = 0. + 0.j + J_t_m1_33 = 0. + 0.j + J_z_m1_33 = 0. + 0.j + + if (ir_flip == -2): + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + + J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 
2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += 
r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir_flip == -1): + J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 
2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += 
r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + 
J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir_flip >= 0): + J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 
1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_03 += 
r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + 
J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 
0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 + j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 + j_r_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m0_01 + j_r_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m1_01 + j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 + j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 + j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 + j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03 + j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 + j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 + j_r_m0_global[tx, iz_cell, ir_cell] += J_r_m0_11 + j_r_m1_global[tx, iz_cell, ir_cell] += J_r_m1_11 + j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m0_12 + j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m1_12 + j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 + j_r_m1_global[tx, iz_cell + 2 + szu2, 
ir_cell] += J_r_m1_13 + j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20 + j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 + j_r_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m0_21 + j_r_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m1_21 + j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 + j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 + j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 + j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 + j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 + j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 + j_r_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 + j_r_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 + j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 + j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 + j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 + j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 + + j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 + j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 + j_t_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m0_01 + j_t_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m1_01 + j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 + j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02 + j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 + j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 + j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 + j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 + j_t_m0_global[tx, iz_cell, ir_cell] += J_t_m0_11 + j_t_m1_global[tx, iz_cell, ir_cell] += J_t_m1_11 + j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m0_12 + j_t_m1_global[tx, iz_cell + 
1 + szu, ir_cell] += J_t_m1_12 + j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 + j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 + j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20 + j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 + j_t_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m0_21 + j_t_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m1_21 + j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 + j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 + j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 + j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 + j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 + j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30 + j_t_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 + j_t_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 + j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 + j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32 + j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 + j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 + + j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 + j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 + j_z_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m0_01 + j_z_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m1_01 + j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 + j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 + j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 + j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 + j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 + j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 + j_z_m0_global[tx, iz_cell, ir_cell] += J_z_m0_11 + 
@numba.njit( parallel=True )
def sum_reduce_2d_array( global_array, reduced_array ):
    """
    Accumulate `global_array`, summed over its first axis, into
    `reduced_array`. The result is added in place to the existing
    content of `reduced_array`; nothing is returned.

    Parameters:
    -----------
    global_array: 3darray of complexs
        Field array indexed as [copy, iz, ir], whose first dimension
        is the one that is summed over (typically: one copy per
        thread used during the current deposition)

    reduced_array: 2darray of complexs
        Array indexed as [iz, ir], which receives the sum
        (is modified by this function)
    """
    # Size of the reduction axis and of the two grid axes
    n_copies = global_array.shape[0]
    n_z = global_array.shape[1]
    n_r = global_array.shape[2]

    # Each parallel iteration handles one value of iz, and therefore
    # writes to a distinct row of `reduced_array`
    for i_z in prange( n_z ):
        # Sequentially go through the copies along the reduction axis
        for i_copy in range( n_copies ):
            # Innermost loop over ir
            for i_r in range( n_r ):
                reduced_array[ i_z, i_r ] += global_array[ i_copy, i_z, i_r ]
@numba.njit(parallel=True)
def gather_field_prange_linear(x, y, z,
                    invdz, zmin, Nz,
                    invdr, rmin, Nr,
                    Er_m0, Et_m0, Ez_m0,
                    Er_m1, Et_m1, Ez_m1,
                    Br_m0, Bt_m0, Bz_m0,
                    Br_m1, Bt_m1, Bz_m1,
                    Ex, Ey, Ez,
                    Bx, By, Bz):
    """
    Gathering of the fields (E and B) using numba with multi-threading.
    Iterates over the particles (one `prange` iteration per particle),
    and calculates the weighted amount of fields acting on each particle
    based on its shape (linear), i.e. from the 2 x 2 grid points that
    surround it in (z, r).
    Fields are gathered in cylindrical coordinates and then
    transformed to cartesian coordinates.
    Supports only mode 0 and 1.

    Parameters
    ----------
    x, y, z : 1darray of floats (in meters)
        The position of the particles

    invdz, invdr : float (in meters^-1)
        Inverse of the grid step along the considered direction

    zmin, rmin : float (in meters)
        Position of the edge of the simulation box along the
        direction considered

    Nz, Nr : int
        Number of gridpoints along the considered direction

    Er_m0, Et_m0, Ez_m0 : 2darray of complexs
        The electric fields on the interpolation grid for the mode 0
        (indexed as [iz, ir])

    Er_m1, Et_m1, Ez_m1 : 2darray of complexs
        The electric fields on the interpolation grid for the mode 1
        (indexed as [iz, ir])

    Br_m0, Bt_m0, Bz_m0 : 2darray of complexs
        The magnetic fields on the interpolation grid for the mode 0
        (indexed as [iz, ir])

    Br_m1, Bt_m1, Bz_m1 : 2darray of complexs
        The magnetic fields on the interpolation grid for the mode 1
        (indexed as [iz, ir])

    Ex, Ey, Ez : 1darray of floats
        The electric fields acting on the particles
        (is modified by this function)

    Bx, By, Bz : 1darray of floats
        The magnetic fields acting on the particles
        (is modified by this function)

    Returns
    -------
    Ex, Ey, Ez, Bx, By, Bz : 1darrays of floats
        The same arrays that were passed as arguments, after each
        element has been overwritten with the gathered field of the
        corresponding particle.
    """
    # Gather the fields for each particle in parallel
    # (each iteration only writes to index i of the output arrays)
    for i in prange(x.shape[0]):
        # Preliminary quantities for the cylindrical conversion
        # --------------------------------------------
        # Position
        xj = x[i]
        yj = y[i]
        zj = z[i]

        # Cylindrical conversion
        rj = math.sqrt( xj**2 + yj**2 )
        if (rj !=0. ) :
            invr = 1./rj
            cos = xj*invr # Cosine
            sin = yj*invr # Sine
        else :
            # On the axis the angle is undefined: use theta = 0
            cos = 1.
            sin = 0.
        # Azimuthal factors: 1 for mode 0, and
        # cos - i*sin = exp(-i*theta) for mode 1
        exptheta_m0 = 1.
        exptheta_m1 = cos - 1.j*sin

        # Get linear weights for the gathering
        # --------------------------------------------
        # Positions of the particles, in the cell unit
        # (the -0.5 offset presumably reflects grid points located at
        # cell centers -- TODO confirm against the grid definition)
        r_cell = invdr*(rj - rmin) - 0.5
        z_cell = invdz*(zj - zmin) - 0.5
        # Original index of the upper and lower cell
        ir_lower = int(math.floor( r_cell ))
        ir_upper = ir_lower + 1
        iz_lower = int(math.floor( z_cell ))
        iz_upper = iz_lower + 1
        # Linear weight
        # (computed before the indices are modified by the
        # boundary conditions below)
        Sr_lower = ir_upper - r_cell
        Sr_upper = r_cell - ir_lower
        Sz_lower = iz_upper - z_cell
        Sz_upper = z_cell - iz_lower
        # Set guard weights to zero
        Sr_guard = 0.

        # Treat the boundary conditions
        # --------------------------------------------
        # guard cells in lower r: the weight of the cell below ir=0
        # is moved to Sr_guard, and the lower index is reset to 0
        if ir_lower < 0:
            Sr_guard = Sr_lower
            Sr_lower = 0.
            ir_lower = 0
        # absorbing in upper r: indices beyond the last radial
        # cell are clamped to Nr-1
        if ir_lower > Nr-1:
            ir_lower = Nr-1
        if ir_upper > Nr-1:
            ir_upper = Nr-1
        # periodic boundaries in z
        # (a single shift by Nz is applied, i.e. this handles indices
        # that are at most one box length outside of [0, Nz-1])
        # lower z boundaries
        if iz_lower < 0:
            iz_lower += Nz
        if iz_upper < 0:
            iz_upper += Nz
        # upper z boundaries
        if iz_lower > Nz-1:
            iz_lower -= Nz
        if iz_upper > Nz-1:
            iz_upper -= Nz

        # Precalculate Shapes
        # (first letter: lower/upper in z; second letter: lower/upper
        # in r, or g for the guard weight below the axis)
        S_ll = Sz_lower*Sr_lower
        S_lu = Sz_lower*Sr_upper
        S_ul = Sz_upper*Sr_lower
        S_uu = Sz_upper*Sr_upper
        S_lg = Sz_lower*Sr_guard
        S_ug = Sz_upper*Sr_guard

        # E-Field
        # ----------------------------
        # Define the initial placeholders for the
        # gathered field for each coordinate
        Fr = 0.
        Ft = 0.
        Fz = 0.

        # Mode 0
        # ----------------------------
        # Create temporary variables
        # for the "per mode" gathering
        Fr_m = 0.j
        Ft_m = 0.j
        Fz_m = 0.j
        # Add the fields for mode 0
        # Lower cell in z, Lower cell in r
        Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ]
        Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ]
        Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ]
        # Lower cell in z, Upper cell in r
        Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ]
        Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ]
        Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ]
        # Upper cell in z, Lower cell in r
        Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ]
        Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ]
        Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ]
        # Upper cell in z, Upper cell in r
        Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ]
        Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ]
        Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ]
        # Add the fields from the guard cells
        # (only reached when the lower radial index was below 0: the
        # guard weight is applied to the values at ir=0, with a -1
        # factor for the r and t components and +1 for z in mode 0.
        # NOTE(review): these signs presumably encode the parity of
        # each component across the axis -- confirm)
        if ir_lower == ir_upper == 0:
            # Lower cell in z
            Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0]
            Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0]
            Fz_m += 1. * S_lg * Ez_m0[ iz_lower, 0]
            # Upper cell in z
            Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0]
            Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0]
            Fz_m += 1. * S_ug * Ez_m0[ iz_upper, 0]
        # Add the fields from the mode 0
        Fr += (Fr_m*exptheta_m0).real
        Ft += (Ft_m*exptheta_m0).real
        Fz += (Fz_m*exptheta_m0).real

        # Mode 1
        # ----------------------------
        # Clear the temporary variables
        # for the "per mode" gathering
        Fr_m = 0.j
        Ft_m = 0.j
        Fz_m = 0.j
        # Add the fields for mode 1
        # Lower cell in z, Lower cell in r
        Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ]
        Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ]
        Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ]
        # Lower cell in z, Upper cell in r
        Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ]
        Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ]
        Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ]
        # Upper cell in z, Lower cell in r
        Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ]
        Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ]
        Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ]
        # Upper cell in z, Upper cell in r
        Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ]
        Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ]
        Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ]
        # Add the fields from the guard cells
        # (same as for mode 0, but with the opposite signs:
        # +1 for the r and t components and -1 for z)
        if ir_lower == ir_upper == 0:
            # Lower cell in z
            Fr_m += 1. * S_lg * Er_m1[ iz_lower, 0]
            Ft_m += 1. * S_lg * Et_m1[ iz_lower, 0]
            Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0]
            # Upper cell in z
            Fr_m += 1. * S_ug * Er_m1[ iz_upper, 0]
            Ft_m += 1. * S_ug * Et_m1[ iz_upper, 0]
            Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0]
        # Add the fields from the mode 1
        # (twice the real part; NOTE(review): the factor 2 presumably
        # accounts for the complex-conjugate mode -1 -- confirm)
        Fr += 2*(Fr_m*exptheta_m1).real
        Ft += 2*(Ft_m*exptheta_m1).real
        Fz += 2*(Fz_m*exptheta_m1).real

        # Convert to Cartesian coordinates
        # and write to particle field arrays
        Ex[i] = cos*Fr - sin*Ft
        Ey[i] = sin*Fr + cos*Ft
        Ez[i] = Fz

        # B-Field
        # ----------------------------
        # Clear the placeholders for the
        # gathered field for each coordinate
        # (the weights and indices computed above are reused as is)
        Fr = 0.
        Ft = 0.
        Fz = 0.

        # Mode 0
        # ----------------------------
        # Create temporary variables
        # for the "per mode" gathering
        Fr_m = 0.j
        Ft_m = 0.j
        Fz_m = 0.j
        # Add the fields for mode 0
        # Lower cell in z, Lower cell in r
        Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ]
        Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ]
        Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ]
        # Lower cell in z, Upper cell in r
        Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ]
        Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ]
        Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ]
        # Upper cell in z, Lower cell in r
        Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ]
        Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ]
        Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ]
        # Upper cell in z, Upper cell in r
        Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ]
        Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ]
        Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ]
        # Add the fields from the guard cells
        # (same sign convention as for the mode 0 of the E field)
        if ir_lower == ir_upper == 0:
            # Lower cell in z
            Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0]
            Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0]
            Fz_m += 1. * S_lg * Bz_m0[ iz_lower, 0]
            # Upper cell in z
            Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0]
            Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0]
            Fz_m += 1. * S_ug * Bz_m0[ iz_upper, 0]
        # Add the fields from the mode 0
        Fr += (Fr_m*exptheta_m0).real
        Ft += (Ft_m*exptheta_m0).real
        Fz += (Fz_m*exptheta_m0).real

        # Mode 1
        # ----------------------------
        # Clear the temporary variables
        # for the "per mode" gathering
        Fr_m = 0.j
        Ft_m = 0.j
        Fz_m = 0.j
        # Add the fields for mode 1
        # Lower cell in z, Lower cell in r
        Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ]
        Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ]
        Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ]
        # Lower cell in z, Upper cell in r
        Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ]
        Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ]
        Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ]
        # Upper cell in z, Lower cell in r
        Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ]
        Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ]
        Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ]
        # Upper cell in z, Upper cell in r
        Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ]
        Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ]
        Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ]

        # Add the fields from the guard cells
        # (same sign convention as for the mode 1 of the E field)
        if ir_lower == ir_upper == 0:
            # Lower cell in z
            Fr_m += 1. * S_lg * Br_m1[ iz_lower, 0]
            Ft_m += 1. * S_lg * Bt_m1[ iz_lower, 0]
            Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0]
            # Upper cell in z
            Fr_m += 1. * S_ug * Br_m1[ iz_upper, 0]
            Ft_m += 1. * S_ug * Bt_m1[ iz_upper, 0]
            Fz_m += -1. * S_ug * Bz_m1[ iz_upper, 0]
        # Add the fields from the mode 1
        Fr += 2*(Fr_m*exptheta_m1).real
        Ft += 2*(Ft_m*exptheta_m1).real
        Fz += 2*(Fz_m*exptheta_m1).real

        # Convert to Cartesian coordinates
        # and write to particle field arrays
        Bx[i] = cos*Fr - sin*Ft
        By[i] = sin*Fr + cos*Ft
        Bz[i] = Fz

    # The output arrays were filled in place; they are also returned
    return Ex, Ey, Ez, Bx, By, Bz
(is modified by this function) - """ - # Deposit the field per cell in parallel - # (for threads < number of particles) - for i in prange(x.shape[0]): - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[i] - yj = y[i] - zj = z[i] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos - 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particle, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the shape factors - ir = np.zeros(4, dtype=int64) #[0,0,0,0] - Sr = np.zeros(4) #[0.,0.,0.,0.] - ir[0] = int64(math.floor(r_cell)) - 1 - ir[1] = ir[0] + 1 - ir[2] = ir[1] + 1 - ir[3] = ir[2] + 1 - Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3 - Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4) - Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4) - Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3 - iz = np.zeros(4, dtype=int64) #[0,0,0,0] - Sz = np.zeros(4) #[0.,0.,0.,0.] - iz[0] = int64(math.floor(z_cell)) - 1 - iz[1] = iz[0] + 1 - iz[2] = iz[1] + 1 - iz[3] = iz[2] + 1 - Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3 - Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4) - Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4) - Sz[3] = -1./6. 
* ((iz[3]-z_cell)-2)**3 - # Lower and upper periodic boundary for z - index_z = 0 - while index_z < 4: - if iz[index_z] < 0: - iz[index_z] += Nz - if iz[index_z] > Nz - 1: - iz[index_z] -= Nz - index_z += 1 - # Lower and upper boundary for r - index_r = 0 - while index_r < 4: - if ir[index_r] < 0: - ir[index_r] = abs(ir[index_r])-1 - Sr[index_r] = (-1.)*Sr[index_r] - if ir[index_r] > Nr - 1: - ir[index_r] = Nr - 1 - index_r += 1 - - # E-Field - # ---------------------------- - # Define the initial placeholders for the - # gathered field for each coordinate - Fr = 0. - Ft = 0. - Fz = 0. - - # Mode 0 - # ---------------------------- - # Create temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 0 - index_r = 0 - while index_r < 4: - index_z = 0 - while index_z < 4: - Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]] - Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]] - if Sz[index_z]*Sr[index_r] < 0: - Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Ez_m0[iz[index_z], ir[index_r]] - else: - Fz_m += Sz[index_z]*Sr[index_r]* \ - Ez_m0[iz[index_z], ir[index_r]] - index_z += 1 - index_r += 1 - - Fr += (Fr_m*exptheta_m0).real - Ft += (Ft_m*exptheta_m0).real - Fz += (Fz_m*exptheta_m0).real - - # Mode 1 - # ---------------------------- - # Clear the temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 1 - index_r = 0 - while index_r < 4: - index_z = 0 - while index_z < 4: - if Sz[index_z]*Sr[index_r] < 0: - Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Er_m1[iz[index_z], ir[index_r]] - Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Et_m1[iz[index_z], ir[index_r]] - else: - Fr_m += Sz[index_z]*Sr[index_r]* \ - Er_m1[iz[index_z], ir[index_r]] - Ft_m += Sz[index_z]*Sr[index_r]* \ - Et_m1[iz[index_z], ir[index_r]] - Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]] - index_z += 1 - index_r += 1 - - # Add the fields 
from the mode 1 - Fr += 2*(Fr_m*exptheta_m1).real - Ft += 2*(Ft_m*exptheta_m1).real - Fz += 2*(Fz_m*exptheta_m1).real - - # Convert to Cartesian coordinates - # and write to particle field arrays - Ex[i] = (cos*Fr - sin*Ft) - Ey[i] = (sin*Fr + cos*Ft) - Ez[i] = Fz - - # B-Field - # ---------------------------- - # Clear the placeholders for the - # gathered field for each coordinate - Fr = 0. - Ft = 0. - Fz = 0. - - # Mode 0 - # ---------------------------- - # Create temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - # Add the fields for mode 0 - index_r = 0 - while index_r < 4: - index_z = 0 - while index_z < 4: - Fr_m += Sz[index_z]*Sr[index_r]* \ - Br_m0[iz[index_z], ir[index_r]] - Ft_m += Sz[index_z]*Sr[index_r]* \ - Bt_m0[iz[index_z], ir[index_r]] - if Sz[index_z]*Sr[index_r] < 0: - Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Bz_m0[iz[index_z], ir[index_r]] - else: - Fz_m += Sz[index_z]*Sr[index_r]* \ - Bz_m0[iz[index_z], ir[index_r]] - index_z += 1 - index_r += 1 - - # Add the fields from the mode 0 - Fr += (Fr_m*exptheta_m0).real - Ft += (Ft_m*exptheta_m0).real - Fz += (Fz_m*exptheta_m0).real - - # Mode 1 - # ---------------------------- - # Clear the temporary variables - # for the "per mode" gathering - Fr_m = 0.j - Ft_m = 0.j - Fz_m = 0.j - - # Add the fields for mode 1 - index_r = 0 - while index_r < 4: - index_z = 0 - while index_z < 4: - if Sz[index_z]*Sr[index_r] < 0: - Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Br_m1[iz[index_z], ir[index_r]] - Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ - Bt_m1[iz[index_z], ir[index_r]] - else: - Fr_m += Sz[index_z]*Sr[index_r]* \ - Br_m1[iz[index_z], ir[index_r]] - Ft_m += Sz[index_z]*Sr[index_r]* \ - Bt_m1[iz[index_z], ir[index_r]] - Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]] - index_z += 1 - index_r += 1 - - # Add the fields from the mode 1 - Fr += 2*(Fr_m*exptheta_m1).real - Ft += 2*(Ft_m*exptheta_m1).real - Fz += 2*(Fz_m*exptheta_m1).real - - # 
Convert to Cartesian coordinates - # and write to particle field arrays - Bx[i] = cos*Fr - sin*Ft - By[i] = sin*Fr + cos*Ft - Bz[i] = Fz - - return Ex, Ey, Ez, Bx, By, Bz \ No newline at end of file +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the field gathering methods linear and cubic order shapes +on the CPU with threading +""" +from numba import prange, int64 +import numba +import math +import numpy as np + +# ----------------------- +# Field gathering linear +# ----------------------- + +@numba.njit(parallel=True) +def gather_field_prange_linear(x, y, z, + invdz, zmin, Nz, + invdr, rmin, Nr, + Er_m0, Et_m0, Ez_m0, + Er_m1, Et_m1, Ez_m1, + Br_m0, Bt_m0, Bz_m0, + Br_m1, Bt_m1, Bz_m1, + Ex, Ey, Ez, + Bx, By, Bz): + """ + Gathering of the fields (E and B) using numba with multi-threading. + Iterates over the particles, calculates the weighted amount + of fields acting on each particle based on its shape (linear). + Fields are gathered in cylindrical coordinates and then + transformed to cartesian coordinates. + Supports only mode 0 and 1. 
+ + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + (is modified by this function) + """ + # Deposit the field per cell in parallel + for i in prange(x.shape[0]): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt( xj**2 + yj**2 ) + if (rj !=0. ) : + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else : + cos = 1. + sin = 0. + exptheta_m0 = 1. 
+ exptheta_m1 = cos - 1.j*sin + + # Get linear weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + # Original index of the uppper and lower cell + ir_lower = int(math.floor( r_cell )) + ir_upper = ir_lower + 1 + iz_lower = int(math.floor( z_cell )) + iz_upper = iz_lower + 1 + # Linear weight + Sr_lower = ir_upper - r_cell + Sr_upper = r_cell - ir_lower + Sz_lower = iz_upper - z_cell + Sz_upper = z_cell - iz_lower + # Set guard weights to zero + Sr_guard = 0. + + # Treat the boundary conditions + # -------------------------------------------- + # guard cells in lower r + if ir_lower < 0: + Sr_guard = Sr_lower + Sr_lower = 0. + ir_lower = 0 + # absorbing in upper r + if ir_lower > Nr-1: + ir_lower = Nr-1 + if ir_upper > Nr-1: + ir_upper = Nr-1 + # periodic boundaries in z + # lower z boundaries + if iz_lower < 0: + iz_lower += Nz + if iz_upper < 0: + iz_upper += Nz + # upper z boundaries + if iz_lower > Nz-1: + iz_lower -= Nz + if iz_upper > Nz-1: + iz_upper -= Nz + + #Precalculate Shapes + S_ll = Sz_lower*Sr_lower + S_lu = Sz_lower*Sr_upper + S_ul = Sz_upper*Sr_lower + S_uu = Sz_upper*Sr_upper + S_lg = Sz_lower*Sr_guard + S_ug = Sz_upper*Sr_guard + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Ez_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0] + Fz_m += 1. 
* S_ug * Ez_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Er_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Et_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Er_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Et_m1[ iz_upper, 0] + Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = cos*Fr - sin*Ft + Ey[i] = sin*Fr + cos*Ft + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Bz_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0] + Fz_m += 1. 
* S_ug * Bz_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ] + + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Br_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Bt_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Br_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Bt_m1[ iz_upper, 0] + Fz_m += -1. 
* S_ug * Bz_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Bx[i] = cos*Fr - sin*Ft + By[i] = sin*Fr + cos*Ft + Bz[i] = Fz + + return Ex, Ey, Ez, Bx, By, Bz + +# ----------------------- +# Field gathering cubic +# ----------------------- + +@numba.njit(parallel=True) +def gather_field_prange_cubic(x, y, z, + invdz, zmin, Nz, + invdr, rmin, Nr, + Er_m0, Et_m0, Ez_m0, + Er_m1, Et_m1, Ez_m1, + Br_m0, Bt_m0, Bz_m0, + Br_m1, Bt_m1, Bz_m1, + Ex, Ey, Ez, + Bx, By, Bz): + """ + Gathering of the fields (E and B) using numba with multi-threading. + Iterates over the particles, calculates the weighted amount + of fields acting on each particle based on its shape (cubic). + Fields are gathered in cylindrical coordinates and then + transformed to cartesian coordinates. + Supports only mode 0 and 1. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + 
(is modified by this function) + """ + # Deposit the field per cell in parallel + # (for threads < number of particles) + for i in prange(x.shape[0]): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos - 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particle, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the shape factors + ir = np.zeros(4, dtype=int64) #[0,0,0,0] + Sr = np.zeros(4) #[0.,0.,0.,0.] + ir[0] = int64(math.floor(r_cell)) - 1 + ir[1] = ir[0] + 1 + ir[2] = ir[1] + 1 + ir[3] = ir[2] + 1 + Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3 + Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4) + Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4) + Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3 + iz = np.zeros(4, dtype=int64) #[0,0,0,0] + Sz = np.zeros(4) #[0.,0.,0.,0.] + iz[0] = int64(math.floor(z_cell)) - 1 + iz[1] = iz[0] + 1 + iz[2] = iz[1] + 1 + iz[3] = iz[2] + 1 + Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3 + Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4) + Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4) + Sz[3] = -1./6. 
* ((iz[3]-z_cell)-2)**3 + # Lower and upper periodic boundary for z + index_z = 0 + while index_z < 4: + if iz[index_z] < 0: + iz[index_z] += Nz + if iz[index_z] > Nz - 1: + iz[index_z] -= Nz + index_z += 1 + # Lower and upper boundary for r + index_r = 0 + while index_r < 4: + if ir[index_r] < 0: + ir[index_r] = abs(ir[index_r])-1 + Sr[index_r] = (-1.)*Sr[index_r] + if ir[index_r] > Nr - 1: + ir[index_r] = Nr - 1 + index_r += 1 + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. + + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]] + if Sz[index_z]*Sr[index_r] < 0: + Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Ez_m0[iz[index_z], ir[index_r]] + else: + Fz_m += Sz[index_z]*Sr[index_r]* \ + Ez_m0[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + if Sz[index_z]*Sr[index_r] < 0: + Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Er_m1[iz[index_z], ir[index_r]] + Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Et_m1[iz[index_z], ir[index_r]] + else: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Er_m1[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Et_m1[iz[index_z], ir[index_r]] + Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields 
from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = (cos*Fr - sin*Ft) + Ey[i] = (sin*Fr + cos*Ft) + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. + + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Br_m0[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Bt_m0[iz[index_z], ir[index_r]] + if Sz[index_z]*Sr[index_r] < 0: + Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Bz_m0[iz[index_z], ir[index_r]] + else: + Fz_m += Sz[index_z]*Sr[index_r]* \ + Bz_m0[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + + # Add the fields for mode 1 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + if Sz[index_z]*Sr[index_r] < 0: + Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Br_m1[iz[index_z], ir[index_r]] + Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Bt_m1[iz[index_z], ir[index_r]] + else: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Br_m1[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Bt_m1[iz[index_z], ir[index_r]] + Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # 
Convert to Cartesian coordinates + # and write to particle field arrays + Bx[i] = cos*Fr - sin*Ft + By[i] = sin*Fr + cos*Ft + Bz[i] = Fz + + return Ex, Ey, Ez, Bx, By, Bz From afd5d3b3aad1b04d9c0fceffd239926678662d74 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Fri, 21 Jul 2017 21:31:33 -0700 Subject: [PATCH 34/36] Replace tx_chunks by an array --- .../particles/deposition/threading_methods.py | 16 ++++---------- fbpic/particles/particles.py | 21 ++++++++++++------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index 3aa895087..2c85e8236 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -125,9 +125,7 @@ def deposit_rho_prange_linear(x, y, z, w, # Deposit the field per cell in parallel (for threads < number of cells) for tx in prange( nthreads ): # Loop over all particles in thread chunk - for idx in range( tx_chunks[tx] ): - # Calculate thread local particle index - ptcl_idx = idx + tx*tx_chunks[0] + for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ): # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position @@ -296,9 +294,7 @@ def deposit_J_prange_linear(x, y, z, w, # Deposit the field per cell in parallel (for threads < number of cells) for tx in prange( nthreads ): # Loop over all particles in thread chunk - for idx in range( tx_chunks[tx] ): - # Calculate thread local particle index - ptcl_idx = idx + tx*tx_chunks[0] + for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ): # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position @@ -539,9 +535,7 @@ def deposit_rho_prange_cubic(x, y, z, w, # Deposit the field per cell in parallel (for threads < number of cells) for tx in prange( nthreads ): # Loop over all particles in thread chunk - for idx in range( tx_chunks[tx] ): - 
# Calculate thread local particle index - ptcl_idx = idx + tx*tx_chunks[0] + for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ): # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position @@ -880,9 +874,7 @@ def deposit_J_prange_cubic(x, y, z, w, # Deposit the field per cell in parallel (for threads < number of cells) for tx in prange( nthreads ): # Loop over all particles in thread chunk - for idx in range( tx_chunks[tx] ): - # Calculate thread local particle index - ptcl_idx = idx + tx*tx_chunks[0] + for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ): # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index 8474d2dc2..63e4a871c 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -743,10 +743,15 @@ def deposit( self, fld, fieldtype ) : 'rho', but is `%s`" % fieldtype) # CPU multi-threading version elif self.use_threading: - # Register particle chunk size for each thread - tx_N = int(self.Ntot/self.nthreads) - tx_chunks = [ tx_N for k in range(self.nthreads) ] - tx_chunks[-1] = tx_chunks[-1] + int(self.Ntot%self.nthreads) + + # Divide particles in chunks (each chunk is handled by a different + # thread) and register the indices that bound each chunks + n_avg_per_thread = int( self.Ntot/self.nthreads ) + # Attribute n_avg_per_thread to each thread (except the last one) + ptcl_chunk_indices = np.array( + [ i_chk*n_avg_per_thread for i_chk in range(self.nthreads+1) ], + dtype=np.uint64 ) + ptcl_chunk_indices[-1] = self.Ntot # Multithreading functions for the deposition of rho or J # for Mode 0 and 1 only. 
if fieldtype == 'rho': @@ -764,14 +769,14 @@ def deposit( self, fld, fieldtype ) : grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, rho_m0_global, rho_m1_global, - self.nthreads, tx_chunks ) + self.nthreads, ptcl_chunk_indices ) elif self.particle_shape == 'cubic': deposit_rho_prange_cubic( self.x, self.y, self.z, self.w, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, rho_m0_global, rho_m1_global, - self.nthreads, tx_chunks ) + self.nthreads, ptcl_chunk_indices ) else: raise ValueError("`particle_shape` should be either \ 'linear' or 'cubic' \ @@ -810,7 +815,7 @@ def deposit( self, fld, fieldtype ) : Jr_m0_global, Jr_m1_global, Jt_m0_global, Jt_m1_global, Jz_m0_global, Jz_m1_global, - self.nthreads, tx_chunks ) + self.nthreads, ptcl_chunk_indices ) elif self.particle_shape == 'cubic': deposit_J_prange_cubic( self.x, self.y, self.z, self.w, @@ -820,7 +825,7 @@ def deposit( self, fld, fieldtype ) : Jr_m0_global, Jr_m1_global, Jt_m0_global, Jt_m1_global, Jz_m0_global, Jz_m1_global, - self.nthreads, tx_chunks ) + self.nthreads, ptcl_chunk_indices ) else: raise ValueError("`particle_shape` should be either \ 'linear' or 'cubic' \ From d35a7c1eed3d068f30e07a0e0c7859bc62cb5472 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Fri, 21 Jul 2017 21:40:23 -0700 Subject: [PATCH 35/36] Changes in variable names and docstring --- .../particles/deposition/threading_methods.py | 433 +++++++++--------- 1 file changed, 214 insertions(+), 219 deletions(-) diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py index 2c85e8236..4f38862a6 100644 --- a/fbpic/particles/deposition/threading_methods.py +++ b/fbpic/particles/deposition/threading_methods.py @@ -78,7 +78,7 @@ def deposit_rho_prange_linear(x, y, z, w, invdz, zmin, Nz, invdr, rmin, Nr, rho_m0_global, rho_m1_global, - nthreads, tx_chunks): + nthreads, ptcl_chunk_indices): """ Deposition of the charge density 
rho using numba prange on the CPU. Iterates over the threads in parallel, while each thread iterates @@ -90,8 +90,6 @@ def deposit_rho_prange_linear(x, y, z, w, Calculates the weighted amount of rho that is deposited to the 4 cells surounding the particle based on its shape (linear). - The rest of the execution is similar to the CUDA equivalent function. - Parameters ---------- x, y, z : 1darray of floats (in meters) @@ -118,22 +116,23 @@ def deposit_rho_prange_linear(x, y, z, w, nthreads : int Number of CPU threads used with numba prange - tx_chunks : list of int - Exact particle batch size per thread. - Last batch size can be greater if Ntot is not a multiple of nthreads. + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. divisions of particle array between threads) """ # Deposit the field per cell in parallel (for threads < number of cells) - for tx in prange( nthreads ): + for i_thread in prange( nthreads ): # Loop over all particles in thread chunk - for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ): + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] # Weights - wj = w[ptcl_idx] + wj = w[i_ptcl] # Cylindrical conversion rj = math.sqrt(xj**2 + yj**2) @@ -215,17 +214,17 @@ def deposit_rho_prange_linear(x, y, z, w, shift_z -= Nz # Write ptcl fields to thread-local part of global deposition array - rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00 - rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00 + rho_m0_global[i_thread, iz_cell, ir_cell] += R_m0_00 + rho_m1_global[i_thread, iz_cell, ir_cell] += R_m1_00 - rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m0_01 - rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m1_01 
+ rho_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += R_m0_01 + rho_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += R_m1_01 - rho_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m0_10 - rho_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m1_10 + rho_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += R_m0_10 + rho_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += R_m1_10 - rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11 - rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11 + rho_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11 + rho_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11 return @@ -241,7 +240,7 @@ def deposit_J_prange_linear(x, y, z, w, j_r_m0_global, j_r_m1_global, j_t_m0_global, j_t_m1_global, j_z_m0_global, j_z_m1_global, - nthreads, tx_chunks): + nthreads, ptcl_chunk_indices): """ Deposition of the current density J using numba prange on the CPU. Iterates over the threads in parallel, while each thread iterates @@ -253,8 +252,6 @@ def deposit_J_prange_linear(x, y, z, w, Calculates the weighted amount of J that is deposited to the 4 cells surounding the particle based on its shape (linear). - The rest of the execution is similar to the CUDA equivalent function. - Parameters ---------- x, y, z : 1darray of floats (in meters) @@ -287,28 +284,29 @@ def deposit_J_prange_linear(x, y, z, w, nthreads : int Number of CPU threads used with numba prange - tx_chunks : list of int - Exact particle batch size per thread. - Last batch size can be greater if Ntot is not a multiple of nthreads. + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. 
divisions of particle array between threads) """ # Deposit the field per cell in parallel (for threads < number of cells) - for tx in prange( nthreads ): + for i_thread in prange( nthreads ): # Loop over all particles in thread chunk - for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ): + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] + uxj = ux[i_ptcl] + uyj = uy[i_ptcl] + uzj = uz[i_ptcl] # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] + inv_gammaj = inv_gamma[i_ptcl] # Weights - wj = w[ptcl_idx] + wj = w[i_ptcl] # Cylindrical conversion rj = math.sqrt(xj**2 + yj**2) @@ -439,41 +437,41 @@ def deposit_J_prange_linear(x, y, z, w, shift_z -= Nz # Write ptcl fields to thread-local part of global deposition array - j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00 - j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00 + j_r_m0_global[i_thread,iz_cell, ir_cell] += J_r_m0_00 + j_r_m1_global[i_thread,iz_cell, ir_cell] += J_r_m1_00 - j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01 - j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01 + j_r_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01 + j_r_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01 - j_r_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10 - j_r_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10 + j_r_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10 + j_r_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10 - j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11 - j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11 + j_r_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + 
shift_r] += J_r_m0_11 + j_r_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11 - j_t_m0_global[tx,iz_cell, ir_cell] += J_t_m0_00 - j_t_m1_global[tx,iz_cell, ir_cell] += J_t_m1_00 + j_t_m0_global[i_thread,iz_cell, ir_cell] += J_t_m0_00 + j_t_m1_global[i_thread,iz_cell, ir_cell] += J_t_m1_00 - j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01 - j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01 + j_t_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01 + j_t_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01 - j_t_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10 - j_t_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10 + j_t_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10 + j_t_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10 - j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11 - j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11 + j_t_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11 + j_t_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11 - j_z_m0_global[tx,iz_cell, ir_cell] += J_z_m0_00 - j_z_m1_global[tx,iz_cell, ir_cell] += J_z_m1_00 + j_z_m0_global[i_thread,iz_cell, ir_cell] += J_z_m0_00 + j_z_m1_global[i_thread,iz_cell, ir_cell] += J_z_m1_00 - j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01 - j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01 + j_z_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01 + j_z_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01 - j_z_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10 - j_z_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10 + j_z_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10 + j_z_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10 - j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11 - j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + 
shift_r] += J_z_m1_11 + j_z_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11 + j_z_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11 return @@ -487,9 +485,8 @@ def deposit_rho_prange_cubic(x, y, z, w, invdz, zmin, Nz, invdr, rmin, Nr, rho_m0_global, rho_m1_global, - nthreads, tx_chunks): + nthreads, ptcl_chunk_indices): """ - Deposition of the charge density rho using numba prange on the CPU. Iterates over the threads in parallel, while each thread iterates over a batch of particles. Intermediate results for each threads are @@ -500,8 +497,6 @@ def deposit_rho_prange_cubic(x, y, z, w, Calculates the weighted amount of rho that is deposited to the 16 cells surounding the particle based on its shape (cubic). - The rest of the execution is similar to the CUDA equivalent function. - Parameters ---------- x, y, z : 1darray of floats (in meters) @@ -528,22 +523,23 @@ def deposit_rho_prange_cubic(x, y, z, w, nthreads : int Number of CPU threads used with numba prange - tx_chunks : list of int - Exact particle batch size per thread. - Last batch size can be greater if Ntot is not a multiple of nthreads. + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. 
divisions of particle array between threads) """ # Deposit the field per cell in parallel (for threads < number of cells) - for tx in prange( nthreads ): + for i_thread in prange( nthreads ): # Loop over all particles in thread chunk - for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ): + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] # Weights - wj = w[ptcl_idx] + wj = w[i_ptcl] # Cylindrical conversion rj = math.sqrt(xj**2 + yj**2) @@ -774,38 +770,38 @@ def deposit_rho_prange_cubic(x, y, z, w, srl = 1 # Write ptcl fields to thread-local part of global deposition array - rho_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00 - rho_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00 - rho_m0_global[tx, iz_cell, ir_cell - 1 + srl] += R_m0_01 - rho_m1_global[tx, iz_cell, ir_cell - 1 + srl] += R_m1_01 - rho_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02 - rho_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02 - rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03 - rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03 - rho_m0_global[tx, iz_cell - 1 + szl, ir_cell] += R_m0_10 - rho_m1_global[tx, iz_cell - 1 + szl, ir_cell] += R_m1_10 - rho_m0_global[tx, iz_cell, ir_cell] += R_m0_11 - rho_m1_global[tx, iz_cell, ir_cell] += R_m1_11 - rho_m0_global[tx, iz_cell + 1 + szu, ir_cell] += R_m0_12 - rho_m1_global[tx, iz_cell + 1 + szu, ir_cell] += R_m1_12 - rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m0_13 - rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m1_13 - rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20 - rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20 - rho_m0_global[tx, iz_cell, 
ir_cell + 1 + sru] += R_m0_21 - rho_m1_global[tx, iz_cell, ir_cell + 1 + sru] += R_m1_21 - rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22 - rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22 - rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23 - rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23 - rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30 - rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30 - rho_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m0_31 - rho_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m1_31 - rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32 - rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32 - rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33 - rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00 + rho_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += R_m0_01 + rho_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += R_m1_01 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell] += R_m0_10 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell] += R_m1_10 + rho_m0_global[i_thread, iz_cell, ir_cell] += R_m0_11 + rho_m1_global[i_thread, iz_cell, ir_cell] += R_m1_11 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += R_m0_12 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += R_m1_12 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += R_m0_13 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += R_m1_13 + 
rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20 + rho_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += R_m0_21 + rho_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += R_m1_21 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30 + rho_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += R_m0_31 + rho_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += R_m1_31 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33 return @@ -821,7 +817,7 @@ def deposit_J_prange_cubic(x, y, z, w, j_r_m0_global, j_r_m1_global, j_t_m0_global, j_t_m1_global, j_z_m0_global, j_z_m1_global, - nthreads, tx_chunks): + nthreads, ptcl_chunk_indices): """ Deposition of the current density J using numba prange on the CPU. Iterates over the threads in parallel, while each thread iterates @@ -833,8 +829,6 @@ def deposit_J_prange_cubic(x, y, z, w, Calculates the weighted amount of J that is deposited to the 16 cells surounding the particle based on its shape (cubic). - The rest of the execution is similar to the CUDA equivalent function. 
- Parameters ---------- x, y, z : 1darray of floats (in meters) @@ -867,28 +861,29 @@ def deposit_J_prange_cubic(x, y, z, w, nthreads : int Number of CPU threads used with numba prange - tx_chunks : list of int - Exact particle batch size per thread. - Last batch size can be greater if Ntot is not a multiple of nthreads. + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. divisions of particle array between threads) """ # Deposit the field per cell in parallel (for threads < number of cells) - for tx in prange( nthreads ): + for i_thread in prange( nthreads ): # Loop over all particles in thread chunk - for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ): + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): # Preliminary arrays for the cylindrical conversion # -------------------------------------------- # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] + uxj = ux[i_ptcl] + uyj = uy[i_ptcl] + uzj = uz[i_ptcl] # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] + inv_gammaj = inv_gamma[i_ptcl] # Weights - wj = w[ptcl_idx] + wj = w[i_ptcl] # Cylindrical conversion rj = math.sqrt(xj**2 + yj**2) @@ -1402,104 +1397,104 @@ def deposit_J_prange_cubic(x, y, z, w, if (ir_cell-1) < 0: srl = 1 - j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 - j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 - j_r_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m0_01 - j_r_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m1_01 - j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 - j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 - j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 - j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell 
- 1 + srl] += J_r_m1_03 - j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 - j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 - j_r_m0_global[tx, iz_cell, ir_cell] += J_r_m0_11 - j_r_m1_global[tx, iz_cell, ir_cell] += J_r_m1_11 - j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m0_12 - j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m1_12 - j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 - j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13 - j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20 - j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 - j_r_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m0_21 - j_r_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m1_21 - j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 - j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 - j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 - j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 - j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 - j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 - j_r_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 - j_r_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 - j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 - j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 - j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 - j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 - - j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 - j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 - j_t_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m0_01 - j_t_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m1_01 - j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 - j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] 
+= J_t_m1_02 - j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 - j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 - j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 - j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 - j_t_m0_global[tx, iz_cell, ir_cell] += J_t_m0_11 - j_t_m1_global[tx, iz_cell, ir_cell] += J_t_m1_11 - j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m0_12 - j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m1_12 - j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 - j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 - j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20 - j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 - j_t_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m0_21 - j_t_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m1_21 - j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 - j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 - j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 - j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 - j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 - j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30 - j_t_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 - j_t_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 - j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 - j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32 - j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 - j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 - - j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 - j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 - j_z_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m0_01 - j_z_m1_global[tx, iz_cell, ir_cell - 1 + srl] += 
J_z_m1_01 - j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 - j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 - j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 - j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 - j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 - j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 - j_z_m0_global[tx, iz_cell, ir_cell] += J_z_m0_11 - j_z_m1_global[tx, iz_cell, ir_cell] += J_z_m1_11 - j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m0_12 - j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m1_12 - j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13 - j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13 - j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20 - j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20 - j_z_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m0_21 - j_z_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m1_21 - j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22 - j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22 - j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23 - j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23 - j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30 - j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30 - j_z_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31 - j_z_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31 - j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32 - j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32 - j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33 - j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 + j_r_m1_global[i_thread, iz_cell - 1 + szl, 
ir_cell - 1 + srl] += J_r_m1_00 + j_r_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_r_m0_01 + j_r_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_r_m1_01 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 + j_r_m0_global[i_thread, iz_cell, ir_cell] += J_r_m0_11 + j_r_m1_global[i_thread, iz_cell, ir_cell] += J_r_m1_11 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_r_m0_12 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_r_m1_12 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 + j_r_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_r_m0_21 + j_r_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_r_m1_21 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 + j_r_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 + j_r_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 + j_r_m1_global[i_thread, iz_cell + 
1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 + + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 + j_t_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_t_m0_01 + j_t_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_t_m1_01 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 + j_t_m0_global[i_thread, iz_cell, ir_cell] += J_t_m0_11 + j_t_m1_global[i_thread, iz_cell, ir_cell] += J_t_m1_11 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_t_m0_12 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_t_m1_12 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 + j_t_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_t_m0_21 + j_t_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_t_m1_21 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 + 
j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30 + j_t_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 + j_t_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 + + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 + j_z_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_z_m0_01 + j_z_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_z_m1_01 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 + j_z_m0_global[i_thread, iz_cell, ir_cell] += J_z_m0_11 + j_z_m1_global[i_thread, iz_cell, ir_cell] += J_z_m1_11 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_z_m0_12 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_z_m1_12 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20 + j_z_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_z_m0_21 + j_z_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_z_m1_21 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] 
+= J_z_m0_22 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30 + j_z_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31 + j_z_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33 return From 62045854951d505d72d18c361aba02c9fe9170ce Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Fri, 21 Jul 2017 21:54:17 -0700 Subject: [PATCH 36/36] Removed all mentions of linear_non_atomic --- fbpic/main.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fbpic/main.py b/fbpic/main.py index 647d9c287..6c3e6c56f 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -182,10 +182,8 @@ def dens_func( z, r ) ... particle_shape: str, optional Set the particle shape for the charge/current deposition. - Possible values are 'cubic', 'linear' and 'linear_non_atomic'. - While 'cubic' corresponds to third order shapes and 'linear' - to first order shapes, 'linear_non_atomic' uses an equivalent - deposition scheme to 'linear' which avoids atomics on the GPU. + Possible values are 'cubic', 'linear'. ('cubic' corresponds to + third order shapes and 'linear' to first order shapes). """ # Check whether to use CUDA self.use_cuda = use_cuda