diff --git a/fbpic/boundaries/moving_window.py b/fbpic/boundaries/moving_window.py index 33d2d921e..f08775b79 100644 --- a/fbpic/boundaries/moving_window.py +++ b/fbpic/boundaries/moving_window.py @@ -9,6 +9,7 @@ from scipy.constants import c from fbpic.particles import Particles from fbpic.lpa_utils.boosted_frame import BoostConverter +from fbpic.threading_utils import njit_parallel, prange # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: @@ -318,86 +319,36 @@ def shift_spect_grid( self, grid, n_move, """ if grid.use_cuda: shift = grid.d_field_shift + # Get a 2D CUDA grid of the size of the grid + tpb, bpg = cuda_tpb_bpg_2d( grid.Ep.shape[0], grid.Ep.shape[1] ) # Shift all the fields on the GPU - self.shift_spect_field_gpu( grid.Ep, shift, n_move ) - self.shift_spect_field_gpu( grid.Em, shift, n_move ) - self.shift_spect_field_gpu( grid.Ez, shift, n_move ) - self.shift_spect_field_gpu( grid.Bp, shift, n_move ) - self.shift_spect_field_gpu( grid.Bm, shift, n_move ) - self.shift_spect_field_gpu( grid.Bz, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Ep, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Em, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Ez, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bp, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bm, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Bz, shift, n_move ) if shift_rho: - self.shift_spect_field_gpu( grid.rho_prev, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.rho_prev, shift, n_move ) if shift_currents: - self.shift_spect_field_gpu( grid.Jp, shift, n_move ) - self.shift_spect_field_gpu( grid.Jm, shift, n_move ) - self.shift_spect_field_gpu( grid.Jz, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Jp, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Jm, shift, n_move ) + shift_spect_array_gpu[tpb, bpg]( grid.Jz, shift, n_move ) else: shift = grid.field_shift # Shift all the fields on the CPU - self.shift_spect_field( grid.Ep, shift, n_move ) - self.shift_spect_field( grid.Em, shift, n_move ) - self.shift_spect_field( grid.Ez, shift, n_move ) - self.shift_spect_field( grid.Bp, shift, n_move ) - self.shift_spect_field( grid.Bm, shift, n_move ) - self.shift_spect_field( grid.Bz, shift, n_move ) + shift_spect_array_cpu( grid.Ep, shift, n_move ) + shift_spect_array_cpu( grid.Em, shift, n_move ) + shift_spect_array_cpu( grid.Ez, shift, n_move ) + shift_spect_array_cpu( grid.Bp, shift, n_move ) + shift_spect_array_cpu( grid.Bm, shift, n_move ) + shift_spect_array_cpu( grid.Bz, shift, n_move ) if shift_rho: - self.shift_spect_field( grid.rho_prev, shift, n_move ) + shift_spect_array_cpu( grid.rho_prev, shift, n_move ) if shift_currents: - self.shift_spect_field( grid.Jp, shift, n_move ) - self.shift_spect_field( grid.Jm, shift, n_move ) - self.shift_spect_field( grid.Jz, shift, n_move ) - - def shift_spect_field( self, field_array, shift_factor, n_move ): - """ - Shift the field 'field_array' by n_move cells. - This is done in spectral space and corresponds to multiplying the - fields with the factor exp(i*kz_true*dz)**n_move . 
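
Reviewer note: the shift applied above rests on the Fourier shift theorem. A minimal standalone NumPy sketch (not part of the patch, assuming a uniform periodic grid and made-up values) of the identity that a shift by n_move cells is a multiplication by exp(1.j*kz*dz)**n_move in spectral space:

import numpy as np

Nz, dz, n_move = 64, 0.1, 3
z = dz*np.arange( Nz )
field = np.exp( -((z - 3.2)/(5*dz))**2 )      # sample field profile
kz = 2*np.pi*np.fft.fftfreq( Nz, d=dz )       # true wavenumbers of the grid
shift_factor = np.exp( 1.j*kz*dz )            # one-cell shift factor
shifted = np.fft.ifft( np.fft.fft( field ) * shift_factor**n_move )
# Positive n_move shifts the field backwards, as the docstrings state
assert np.allclose( shifted.real, np.roll( field, -n_move ) )
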
- (Typically n_move is positive, and the fields are shifted backwards) - - Parameters - ---------- - field_array: 2darray of complexs - Contains the value of the fields, and is modified by - this function - - shift_factor: 1darray of complexs - Contains the shift array, that is multiplied to the fields in - spectral space to shift them by one cell in spatial space - ( exp(i*kz_true*dz) ) - - n_move: int - The number of cells by which the grid should be shifted - """ - # Multiply with (shift_factor*sign(n_move))**n_move - field_array *= ( shift_factor[:, np.newaxis] )**n_move - - def shift_spect_field_gpu( self, field_array, shift_factor, n_move): - """ - Shift the field 'field_array' by n_move cells on the GPU. - This is done in spectral space and corresponds to multiplying the - fields with the factor exp(i*kz_true*dz)**n_move . - (Typically n_move is positive, and the fields are shifted backwards) - - Parameters - ---------- - field_array: 2darray of complexs - Contains the value of the fields, and is modified by - this function - - shift_factor: 1darray of complexs - Contains the shift array, that is multiplied to the fields in - spectral space to shift them by one cell in spatial space - ( exp(i*kz_true*dz) ) - - n_move: int - The number of cells by which the grid should be shifted - """ - # Get a 2D CUDA grid of the size of the grid - dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d( - field_array.shape[0], field_array.shape[1] ) - # Shift the field array in place - shift_spect_array_gpu[dim_grid_2d, dim_block_2d]( - field_array, shift_factor, n_move) + shift_spect_array_cpu( grid.Jp, shift, n_move ) + shift_spect_array_cpu( grid.Jm, shift, n_move ) + shift_spect_array_cpu( grid.Jz, shift, n_move ) def shift_interp_grid( self, grid, n_move, shift_rho=True, shift_currents=False ): @@ -513,6 +464,39 @@ def shift_interp_field_gpu( self, field_array, n_move): # Return the new shifted field array return( field_array ) +@njit_parallel +def shift_spect_array_cpu( field_array, shift_factor, n_move ): + """ + Shift the field 'field_array' by n_move cells on CPU. + This is done in spectral space and corresponds to multiplying the + fields with the factor exp(i*kz_true*dz)**n_move . 
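
Reviewer note: the explicit loop in shift_spect_array_cpu (body just below) reproduces the removed NumPy broadcast field_array *= ( shift_factor[:, np.newaxis] )**n_move in a prange-friendly form. A standalone equivalence check with made-up data, not part of the patch:

import numpy as np

Nz, Nr, n_move = 8, 4, 3
rng = np.random.default_rng( 0 )
field = rng.standard_normal( (Nz, Nr) ) + 1.j*rng.standard_normal( (Nz, Nr) )
shift = np.exp( 2.j*np.pi*rng.random( Nz ) )
expected = field * shift[:, np.newaxis]**n_move
result = field.copy()
for iz in range( Nz ):
    power_shift = shift[iz]
    for i in range( 1, n_move ):   # raise to the power n_move by repeated multiplication
        power_shift *= shift[iz]
    for ir in range( Nr ):
        result[iz, ir] *= power_shift
assert np.allclose( result, expected )
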
+ + Parameters + ---------- + field_array: 2darray of complexs + Contains the value of the fields, and is modified by + this function + + shift_factor: 1darray of complexs + Contains the shift array, that is multiplied to the fields in + spectral space to shift them by one cell in spatial space + ( exp(i*kz_true*dz) ) + + n_move: int + The number of cells by which the grid should be shifted + """ + Nz, Nr = field_array.shape + + # Loop over the 2D array (in parallel over z if threading is enabled) + for iz in prange( Nz ): + power_shift = shift_factor[iz] + # Calculate the shift factor (raising to the power n_move) + for i in range(1,n_move): + power_shift *= shift_factor[iz] + # Shift fields backwards + for ir in range( Nr ): + field_array[iz, ir] *= power_shift + if cuda_installed: @cuda.jit('void(complex128[:,:], complex128[:,:], int32)') diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py index 830fec37a..e80652181 100644 --- a/fbpic/fields/numba_methods.py +++ b/fbpic/fields/numba_methods.py @@ -5,21 +5,18 @@ This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) It defines the optimized fields methods that use numba on a CPU """ -import numba from scipy.constants import c, epsilon_0, mu_0 c2 = c**2 +from fbpic.threading_utils import njit_parallel, prange -@numba.jit('void(complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - float64[:,:], float64[:,:], float64[:,:], \ - float64, int32, int32)') +@njit_parallel def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, inv_dt, Nz, Nr ): """ Correct the currents in spectral space, using the standard PSATD scheme """ - # Loop over the 2D grid - for iz in range(Nz): + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): for ir in range(Nr): # Calculate the intermediate variable F @@ -33,13 +30,9 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz, Jm[iz, ir] += -0.5 * kr[iz, ir] * F Jz[iz, ir] += -1.j * kz[iz, ir] * F -@numba.jit('void(complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - float64[:,:], float64[:,:], float64[:,:], \ - float64[:,:], float64[:,:], float64[:,:], float64[:,:], float64, \ - int8, int32, int32)') + return + +@njit_parallel def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, @@ -50,8 +43,8 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, See the documentation of SpectralGrid.push_eb_with """ - # Loop over the 2D grid - for iz in range(Nz): + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): for ir in range(Nr): # Save the electric fields, since they are needed for the B push @@ -106,7 +99,9 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \ + 1.j*kr[iz, ir]*Jm[iz, ir] ) -@numba.jit + return + +@njit_parallel def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, kz, kr, inv_k2, j_corr_coef, T_eb, T_cc, @@ -115,8 +110,8 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, Correct the currents in spectral space, using the assumption of comoving currents """ - # Loop over the 2D grid - for iz in range(Nz): + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz):
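
Reviewer note: the correction loop in numba_correct_currents_standard above enforces the discrete continuity equation in spectral space. A sketch of the algebra, assuming FBPIC's spectral form of the divergence, i k_z J_z + k_r (J_+ - J_-), and reconstructing the intermediate variable F whose assignment is outside this hunk:

F = -\frac{1}{k^2}\left[ \frac{\rho^{n+1}-\rho^{n}}{\Delta t}
      + i k_z J_z + k_r \left( J_+ - J_- \right) \right],
\qquad k^2 = k_z^2 + k_r^2

J_\pm \mathrel{+}= \pm\tfrac{1}{2}\, k_r F, \quad
J_z \mathrel{+}= -i k_z F
\;\Longrightarrow\;
i k_z J_z + k_r \left( J_+ - J_- \right)
  + \frac{\rho^{n+1}-\rho^{n}}{\Delta t} = 0

since the updates add exactly (k_r^2 + k_z^2) F = k^2 F to the divergence. The comoving variant performs the analogous cancellation with the T_eb, T_cc coefficients.
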
for ir in range(Nr): # Calculate the intermediate variable F @@ -130,7 +125,9 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz, Jm[iz, ir] += -0.5 * kr[iz, ir] * F Jz[iz, ir] += -1.j * kz[iz, ir] * F -@numba.jit + return + +@njit_parallel def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, rho_prev, rho_next, rho_prev_coef, rho_next_coef, j_coef, @@ -207,3 +204,5 @@ def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz, + 1.j*kr[iz, ir]*Em_old ) \ + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \ + 1.j*kr[iz, ir]*Jm[iz, ir] ) + + return diff --git a/fbpic/fields/spectral_transform/spectral_transformer.py b/fbpic/fields/spectral_transform/spectral_transformer.py index a9f99222f..af2d2858b 100644 --- a/fbpic/fields/spectral_transform/spectral_transformer.py +++ b/fbpic/fields/spectral_transform/spectral_transformer.py @@ -9,6 +9,7 @@ from .hankel import DHT from .fourier import FFT +from .threading_methods import numba_rt_to_pm, numba_pm_to_rt # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: @@ -139,12 +140,13 @@ def spect2interp_vect( self, spect_array_p, spect_array_m, self.spect_buffer_r, self.spect_buffer_t ) else : # Combine them on the CPU - # (It is important to write the affectation in the following way, - # since self.spect_buffer_p and self.spect_buffer_r actually point - # to the same object, for memory economy) - self.spect_buffer_r[:,:], self.spect_buffer_t[:,:] = \ - ( self.spect_buffer_p + self.spect_buffer_m), \ - 1.j*( self.spect_buffer_p - self.spect_buffer_m) + # (self.spect_buffer_r and self.spect_buffer_t are + # passed in the following line, in order to make things + # explicit, but they actually point to the same object + # as self.spect_buffer_p, self.spect_buffer_m, + # for economy of memory) + numba_pm_to_rt( self.spect_buffer_p, self.spect_buffer_m, + self.spect_buffer_r, self.spect_buffer_t ) # Finally perform the FFT (along axis 0, which corresponds to z) self.fft.inverse_transform( self.spect_buffer_r, interp_array_r ) @@ -205,13 +207,14 @@ def interp2spect_vect( self, interp_array_r, interp_array_t, self.spect_buffer_r, self.spect_buffer_t, self.spect_buffer_p, self.spect_buffer_m ) else : - # Combine them on the CPU - # (It is important to write the affectation in the following way, - # since self.spect_buffer_p and self.spect_buffer_r actually point - # to the same object, for memory economy.) 
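
Reviewer note: the removed tuple assignments were written that way because the p/m and r/t buffers alias each other; the new numba_rt_to_pm / numba_pm_to_rt kernels (below) preserve correctness by reading each element into temporaries before writing. A toy demonstration with made-up values, not part of the patch:

import numpy as np

buffer_r = np.array( [1.+2.j] )
buffer_t = np.array( [3.-1.j] )
buffer_p, buffer_m = buffer_r, buffer_t   # aliased buffers, as in the transformer
value_r = buffer_r[0]                     # read before overwriting
value_t = buffer_t[0]
buffer_p[0] = 0.5*( value_r - 1.j*value_t )
buffer_m[0] = 0.5*( value_r + 1.j*value_t )
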
- self.spect_buffer_p[:,:], self.spect_buffer_m[:,:] = \ - 0.5*( self.spect_buffer_r - 1.j*self.spect_buffer_t ), \ - 0.5*( self.spect_buffer_r + 1.j*self.spect_buffer_t ) + # Combine them on the CPU + # (self.spect_buffer_p and self.spect_buffer_m are + # passed in the following line, in order to make things + # explicit, but they actually point to the same object + # as self.spect_buffer_r, self.spect_buffer_t, + # for economy of memory) + numba_rt_to_pm( self.spect_buffer_r, self.spect_buffer_t, + self.spect_buffer_p, self.spect_buffer_m ) # Perform the inverse DHT (along axis -1, which corresponds to r) self.dhtp.transform( self.spect_buffer_p, spect_array_p ) diff --git a/fbpic/fields/spectral_transform/threading_methods.py b/fbpic/fields/spectral_transform/threading_methods.py new file mode 100644 index 000000000..ce16dbeef --- /dev/null +++ b/fbpic/fields/spectral_transform/threading_methods.py @@ -0,0 +1,58 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines a set of functions that are useful when converting the +fields from the interpolation grid to the spectral grid and vice-versa +""" +from fbpic.threading_utils import prange, njit_parallel + +# ---------------------------------------------------- +# Functions that combine components in spectral space +# ---------------------------------------------------- + +@njit_parallel +def numba_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) : + """ + Combine the arrays buffer_r and buffer_t to produce the + arrays buffer_p and buffer_m, according to the rules of + the Fourier-Hankel decomposition (see associated paper) + """ + Nz, Nr = buffer_r.shape + + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): + for ir in range(Nr): + + # Use intermediate variables, as the arrays + # buffer_r and buffer_t may actually point to the same + # object as buffer_p and buffer_m, for economy of memory + value_r = buffer_r[iz, ir] + value_t = buffer_t[iz, ir] + # Combine the values + buffer_p[iz, ir] = 0.5*( value_r - 1.j*value_t ) + buffer_m[iz, ir] = 0.5*( value_r + 1.j*value_t ) + + +@njit_parallel +def numba_pm_to_rt( buffer_p, buffer_m, buffer_r, buffer_t ) : + """ + Combine the arrays buffer_p and buffer_m to produce the + arrays buffer_r and buffer_t, according to the rules of + the Fourier-Hankel decomposition (see associated paper) + """ + Nz, Nr = buffer_p.shape + + # Loop over the 2D grid (parallel in z, if threading is installed) + for iz in prange(Nz): + for ir in range(Nr): + + # Use intermediate variables, as the arrays + # buffer_r and buffer_t may actually point to the same + # object as buffer_p and buffer_m, for economy of memory + value_p = buffer_p[iz, ir] + value_m = buffer_m[iz, ir] + # Combine the values + buffer_r[iz, ir] = ( value_p + value_m ) + buffer_t[iz, ir] = 1.j*( value_p - value_m ) diff --git a/fbpic/lpa_utils/laser/antenna.py b/fbpic/lpa_utils/laser/antenna.py index 3f3e8425c..d1aaf10b0 100644 --- a/fbpic/lpa_utils/laser/antenna.py +++ b/fbpic/lpa_utils/laser/antenna.py @@ -10,8 +10,8 @@ from scipy.constants import e, c, epsilon_0, physical_constants r_e = physical_constants['classical electron radius'][0] from .profiles import gaussian_profile -from fbpic.particles.utility_methods import weights -from fbpic.particles.numba_methods import deposit_field_numba +from fbpic.particles.utilities.utility_methods import weights +from
fbpic.particles.deposition.numba_methods import deposit_field_numba # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed diff --git a/fbpic/main.py b/fbpic/main.py index f508b0f69..6c3e6c56f 100644 --- a/fbpic/main.py +++ b/fbpic/main.py @@ -10,6 +10,9 @@ # (This needs to be done before the other imports, # as it sets the cuda context) from mpi4py import MPI +import numba +# Check if threading is available +from .threading_utils import threading_enabled # Check if CUDA is available, then import CUDA functions from .cuda_utils import cuda_installed if cuda_installed: @@ -41,12 +44,10 @@ class Simulation(object): def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax, p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0., n_order=-1, dens_func=None, filter_currents=True, - v_comoving=None, use_galilean=False, - initialize_ions=False, use_cuda=False, - n_guard=None, n_damp=30, - exchange_period=None, boundaries='periodic', - gamma_boost=None, use_all_mpi_ranks=True, - particle_shape='linear' ): + v_comoving=None, use_galilean=False, initialize_ions=False, + use_cuda=False, n_guard=None, n_damp=30, exchange_period=None, + boundaries='periodic', gamma_boost=None, + use_all_mpi_ranks=True, particle_shape='linear' ): """ Initializes a simulation, by creating the following structures: @@ -181,17 +182,17 @@ def dens_func( z, r ) ... particle_shape: str, optional Set the particle shape for the charge/current deposition. - Possible values are 'cubic', 'linear' and 'linear_non_atomic'. - While 'cubic' corresponds to third order shapes and 'linear' - to first order shapes, 'linear_non_atomic' uses an equivalent - deposition scheme to 'linear' which avoids atomics on the GPU. + Possible values are 'cubic', 'linear'. ('cubic' corresponds to + third order shapes and 'linear' to first order shapes). """ - # Check whether to use cuda + # Check whether to use CUDA self.use_cuda = use_cuda if (use_cuda==True) and (cuda_installed==False): print('*** Cuda not available for the simulation.') print('*** Performing the simulation on CPU.') self.use_cuda = False + # CPU multi-threading + self.use_threading = threading_enabled # Register the comoving parameters self.v_comoving = v_comoving @@ -212,7 +213,7 @@ def dens_func( z, r ) ... self.comm = BoundaryCommunicator( Nz, zmin, zmax, Nr, rmax, Nm, dt, boundaries, n_order, n_guard, n_damp, exchange_period, use_all_mpi_ranks ) - print_simulation_setup( self.comm, self.use_cuda ) + print_simulation_setup( self.comm, self.use_cuda, self.use_threading ) # Modify domain region zmin, zmax, p_zmin, p_zmax, Nz = \ self.comm.divide_into_domain(zmin, zmax, p_zmin, p_zmax) @@ -234,19 +235,18 @@ def dens_func( z, r ) ... 
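
Reviewer note: fbpic/threading_utils.py is imported throughout this patch (threading_enabled here, njit_parallel and prange elsewhere) but its contents are not included in the diff. A hedged sketch of what the module presumably provides, namely serial fallbacks when numba's parallel backend is unavailable; the actual module may differ:

# Hypothetical sketch of fbpic/threading_utils.py -- not shown in this diff
try:
    from numba import njit, prange
    njit_parallel = njit( parallel=True )   # multi-threaded jit decorator
    threading_enabled = True
except ImportError:
    threading_enabled = False
    prange = range                          # serial fallback
    def njit_parallel( func ):
        return func                         # no-op decorator
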
# Initialize the electrons and the ions grid_shape = self.fld.interp[0].Ez.shape self.ptcl = [ - Particles( q=-e, m=m_e, n=n_e, Npz=Npz, zmin=p_zmin, - zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, - Nptheta=p_nt, dt=dt, dens_func=dens_func, - use_cuda=self.use_cuda, uz_m=uz_m, - grid_shape=grid_shape, particle_shape=particle_shape) ] + Particles(q=-e, m=m_e, n=n_e, Npz=Npz, zmin=p_zmin, + zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, + Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m, + grid_shape=grid_shape, particle_shape=particle_shape, + use_cuda=self.use_cuda ) ] if initialize_ions : self.ptcl.append( Particles(q=e, m=m_p, n=n_e, Npz=Npz, zmin=p_zmin, zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax, - Nptheta=p_nt, dt=dt, dens_func=dens_func, - use_cuda=self.use_cuda, uz_m=uz_m, - grid_shape=grid_shape, - particle_shape=particle_shape ) ) + Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m, + grid_shape=grid_shape, particle_shape=particle_shape, + use_cuda=self.use_cuda ) ) # Register the number of particles per cell along z, and dt # (Necessary for the moving window) @@ -578,7 +578,7 @@ def progression_bar( i, Ntot, measured_start, Nbars=50, char='-'): sys.stdout.write(', %d:%02d:%02d left' % (h, m, s)) sys.stdout.flush() -def print_simulation_setup( comm, use_cuda ): +def print_simulation_setup( comm, use_cuda, use_threading ): """ Print message about the number of proc and whether it is using GPU or CPU. @@ -590,13 +590,20 @@ def print_simulation_setup( comm, use_cuda ): use_cuda: bool Whether the simulation is set up to use CUDA + + use_threading: bool + Whether the simulation is set up to use threads on CPU """ if comm.rank == 0: if use_cuda: message = "\nRunning FBPIC on GPU " else: message = "\nRunning FBPIC on CPU " - message += "with %d proc.\n" %comm.size + message += "with %d proc" %comm.size + if use_threading and not use_cuda: + message += " (%d threads per proc)" %numba.config.NUMBA_NUM_THREADS + message += ".\n" + print( message ) def adapt_to_grid( x, p_xmin, p_xmax, p_nx, ncells_empty=0 ): diff --git a/fbpic/particles/cuda_deposition/cubic.py b/fbpic/particles/cuda_deposition/cubic.py deleted file mode 100644 index 45e15e0eb..000000000 --- a/fbpic/particles/cuda_deposition/cubic.py +++ /dev/null @@ -1,1214 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the deposition methods for rho and J for cubic order shapes -""" -from numba import cuda, int64 -import math -from scipy.constants import c -import numpy as np - -# Shape Factor helper functions to compute particle shapes. - - -@cuda.jit(device=True, inline=True) -def z_shape(cell_position, index): - iz = int64(math.floor(cell_position)) - 1 - if index == 0: - return (-1./6.)*((cell_position-iz)-2)**3 - if index == 1: - return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) - if index == 2: - return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) - if index == 3: - return (-1./6.)*(((iz+3)-cell_position)-2)**3 - - -@cuda.jit(device=True, inline=True) -def r_shape(cell_position, index): - flip_factor = 1. - ir = int64(math.floor(cell_position)) - 1 - if index == 0: - if ir < 0: - flip_factor = -1. - return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 - if index == 1: - if ir+1 < 0: - flip_factor = -1. 
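
Reviewer note on the deleted shape-factor helpers: z_shape (above) evaluates the four cubic (third-order) B-spline weights, which sum to one for any particle position; r_shape adds a sign flip for stencil points that fold below the axis. A standalone check of the partition of unity, not part of the patch:

import math

def z_shape( cell_position, index ):
    iz = math.floor( cell_position ) - 1
    if index == 0:
        return (-1./6.)*((cell_position-iz)-2)**3
    if index == 1:
        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)
    if index == 2:
        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)
    if index == 3:
        return (-1./6.)*(((iz+3)-cell_position)-2)**3

assert abs( sum( z_shape(2.37, k) for k in range(4) ) - 1. ) < 1e-12
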
- return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) - if index == 2: - if ir+2 < 0: - flip_factor = -1. - return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) - if index == 3: - if ir+3 < 0: - flip_factor = -1. - return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 - -# ------------------------------- -# Field deposition utility - rho -# ------------------------------- - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - int32[:], int32[:])') -def deposit_rho_gpu_cubic(x, y, z, w, - invdz, zmin, Nz, - invdr, rmin, Nr, - rho_m0, rho_m1, - cell_idx, prefix_sum): - """ - Deposition of the charge density rho using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of rho that is deposited to the - 16 cells surounding the particle based on its shape (cubic). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 16 variables (one for each surrounding cell) to - maintain parallelism while avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - rho_m0, rho_m1 : 2darrays of complexs - The charge density on the interpolation grid for - mode 0 and 1. (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i / Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare local field arrays - R_m0_00 = 0. - R_m1_00 = 0. + 0.j - - R_m0_01 = 0. - R_m1_01 = 0. + 0.j - - R_m0_02 = 0. - R_m1_02 = 0. + 0.j - - R_m0_03 = 0. - R_m1_03 = 0. + 0.j - - R_m0_10 = 0. - R_m1_10 = 0. + 0.j - - R_m0_11 = 0. - R_m1_11 = 0. + 0.j - - R_m0_12 = 0. - R_m1_12 = 0. + 0.j - - R_m0_13 = 0. - R_m1_13 = 0. + 0.j - - R_m0_20 = 0. - R_m1_20 = 0. + 0.j - - R_m0_21 = 0. - R_m1_21 = 0. + 0.j - - R_m0_22 = 0. - R_m1_22 = 0. + 0.j - - R_m0_23 = 0. - R_m1_23 = 0. + 0.j - - R_m0_30 = 0. - R_m1_30 = 0. + 0.j - - R_m0_31 = 0. - R_m1_31 = 0. + 0.j - - R_m0_32 = 0. - R_m1_32 = 0. + 0.j - - R_m0_33 = 0. - R_m1_33 = 0. 
+ 0.j - - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0_scal = wj * exptheta_m0 - # Mode 1 - R_m1_scal = wj * exptheta_m1 - # Compute values in local copies and consider boundaries - ir0 = int64(math.floor(r_cell)) - 1 - - if (ir0 == -2): - R_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - - if (ir0 == -1): - R_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += 
r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal - R_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - if (ir0 >= 0): - R_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal - R_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal - R_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal - R_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal - R_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal - R_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal - R_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal - R_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal - R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal - R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal - R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal - R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal - R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal - R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal - R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal - R_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal - R_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal - R_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal - R_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal - R_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal - R_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal - R_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal - - R_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal - R_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal - R_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal - R_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal - R_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal - R_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal - R_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal - R_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal - - # Index Shifting since local copies are centered around - 
# the current cell - srl = 0 # shift r lower - sru = 0 # shift r upper inner - sru2 = 0 # shift r upper outer - szl = 0 # shift z lower - szu = 0 # shift z upper inner - szu2 = 0 # shift z upper outer - if (iz_cell-1) < 0: - szl += Nz - if (iz_cell) == (Nz - 1): - szu -= Nz - szu2 -= Nz - if (iz_cell+1) == (Nz - 1): - szu2 -= Nz - if (ir_cell) >= (Nr - 1): - sru = -1 - sru2 = -2 - if (ir_cell+1) == (Nr - 1): - sru2 = -1 - if (ir_cell-1) < 0: - srl = 1 - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m0_00.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell - 1 + srl), R_m0_01.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell - 1 + srl), R_m1_01.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell - 1 + srl), R_m1_01.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m0_02.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m0_03.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell ), R_m0_10.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell ), R_m1_10.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell ), R_m1_10.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_11.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_11.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_11.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell), R_m0_12.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell), R_m1_12.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell), R_m1_12.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell), R_m0_13.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell), R_m1_13.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell), R_m1_13.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m0_20.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 1 + sru), R_m0_21.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 1 + sru), R_m1_21.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 1 + sru), R_m1_21.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m0_22.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m0_23.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m0_30.real) - cuda.atomic.add(rho_m1.real, (iz_cell - 1 
+ szl, ir_cell + 2 + sru2), R_m1_30.real) - cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 2 + sru2), R_m0_31.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 2 + sru2), R_m1_31.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 2 + sru2), R_m1_31.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m0_32.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m0_33.real) - cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.real) - cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.imag) - - -# ------------------------------- -# Field deposition utility - J -# ------------------------------- - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:],\ - int32[:], int32[:])') -def deposit_J_gpu_cubic(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - j_r_m0, j_r_m1, - j_t_m0, j_t_m1, - j_z_m0, j_z_m1, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 16 cells surounding the particle based on its shape (cubic). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 16 variables (one for each cell) to maintain - parallelism while avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs - The current component in each direction (r, t, z) - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i/Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). 
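
Reviewer note: both deleted kernels recover the per-cell particle count from the inclusive prefix sum over sorted particles, exactly as the comments above describe. A standalone illustration with made-up counts, not part of the patch:

import numpy as np

particles_per_cell = np.array( [2, 0, 3, 1] )
prefix_sum = np.cumsum( particles_per_cell )   # inclusive sum: [2, 2, 5, 6]
for i in range( len(prefix_sum) ):
    incl_offset = prefix_sum[i]
    if i > 0:
        frequency_per_cell = incl_offset - prefix_sum[i-1]
    else:
        frequency_per_cell = incl_offset
    assert frequency_per_cell == particles_per_cell[i]
    # sorted particle indices handled by cell i:
    # incl_offset-1, incl_offset-2, ..., incl_offset-frequency_per_cell
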
- incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare the local field value for - # all possible deposition directions, - # depending on the shape order and per mode for r,t and z. - J_r_m0_00 = 0. - J_t_m0_00 = 0. - J_z_m0_00 = 0. - J_r_m1_00 = 0. + 0.j - J_t_m1_00 = 0. + 0.j - J_z_m1_00 = 0. + 0.j - - J_r_m0_01 = 0. - J_t_m0_01 = 0. - J_z_m0_01 = 0. - J_r_m1_01 = 0. + 0.j - J_t_m1_01 = 0. + 0.j - J_z_m1_01 = 0. + 0.j - - J_r_m0_02 = 0. - J_t_m0_02 = 0. - J_z_m0_02 = 0. - J_r_m1_02 = 0. + 0.j - J_t_m1_02 = 0. + 0.j - J_z_m1_02 = 0. + 0.j - - J_r_m0_03 = 0. - J_t_m0_03 = 0. - J_z_m0_03 = 0. - J_r_m1_03 = 0. + 0.j - J_t_m1_03 = 0. + 0.j - J_z_m1_03 = 0. + 0.j - - J_r_m0_10 = 0. - J_t_m0_10 = 0. - J_z_m0_10 = 0. - J_r_m1_10 = 0. + 0.j - J_t_m1_10 = 0. + 0.j - J_z_m1_10 = 0. + 0.j - - J_r_m0_11 = 0. - J_t_m0_11 = 0. - J_z_m0_11 = 0. - J_r_m1_11 = 0. + 0.j - J_t_m1_11 = 0. + 0.j - J_z_m1_11 = 0. + 0.j - - J_r_m0_12 = 0. - J_t_m0_12 = 0. - J_z_m0_12 = 0. - J_r_m1_12 = 0. + 0.j - J_t_m1_12 = 0. + 0.j - J_z_m1_12 = 0. + 0.j - - J_r_m0_13 = 0. - J_t_m0_13 = 0. - J_z_m0_13 = 0. - J_r_m1_13 = 0. + 0.j - J_t_m1_13 = 0. + 0.j - J_z_m1_13 = 0. + 0.j - - J_r_m0_20 = 0. - J_t_m0_20 = 0. - J_z_m0_20 = 0. - J_r_m1_20 = 0. + 0.j - J_t_m1_20 = 0. + 0.j - J_z_m1_20 = 0. + 0.j - - J_r_m0_21 = 0. - J_t_m0_21 = 0. - J_z_m0_21 = 0. - J_r_m1_21 = 0. + 0.j - J_t_m1_21 = 0. + 0.j - J_z_m1_21 = 0. + 0.j - - J_r_m0_22 = 0. - J_t_m0_22 = 0. - J_z_m0_22 = 0. - J_r_m1_22 = 0. + 0.j - J_t_m1_22 = 0. + 0.j - J_z_m1_22 = 0. + 0.j - - J_r_m0_23 = 0. - J_t_m0_23 = 0. - J_z_m0_23 = 0. - J_r_m1_23 = 0. + 0.j - J_t_m1_23 = 0. + 0.j - J_z_m1_23 = 0. + 0.j - - J_r_m0_30 = 0. - J_t_m0_30 = 0. - J_z_m0_30 = 0. - J_r_m1_30 = 0. + 0.j - J_t_m1_30 = 0. + 0.j - J_z_m1_30 = 0. + 0.j - - J_r_m0_31 = 0. - J_t_m0_31 = 0. - J_z_m0_31 = 0. - J_r_m1_31 = 0. + 0.j - J_t_m1_31 = 0. + 0.j - J_z_m1_31 = 0. + 0.j - - J_r_m0_32 = 0. - J_t_m0_32 = 0. - J_z_m0_32 = 0. - J_r_m1_32 = 0. + 0.j - J_t_m1_32 = 0. + 0.j - J_z_m1_32 = 0. + 0.j - - J_r_m0_33 = 0. - J_t_m0_33 = 0. - J_z_m0_33 = 0. - J_r_m1_33 = 0. + 0.j - J_t_m1_33 = 0. + 0.j - J_z_m1_33 = 0. + 0.j - - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. 
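
Reviewer note: the cylindrical conversion above computes the azimuthal phase factor without a trigonometric call; mode m deposits with weight exp(i*m*theta), so mode 0 uses 1 and mode 1 uses cos(theta) + 1j*sin(theta). A standalone check with made-up coordinates, not part of the patch:

import cmath, math

xj, yj = 0.3, -0.4
rj = math.sqrt( xj**2 + yj**2 )
if rj != 0.:
    cos, sin = xj/rj, yj/rj    # cosine and sine of theta, without atan2
else:
    cos, sin = 1., 0.
exptheta_m1 = cos + 1.j*sin
assert abs( exptheta_m1 - cmath.exp( 1.j*math.atan2(yj, xj) ) ) < 1e-12
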
- exptheta_m1 = cos + 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 - J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 - J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 - # Mode 1 - J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 - J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 - J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 - - # Compute values in local copies and consider boundaries - ir0 = int64(math.floor(r_cell)) - 1 - - if (ir0 == -2): - J_r_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal - J_r_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - - J_t_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 
1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - if (ir0 == -1): - J_r_m0_10 += r_shape(r_cell, 
0)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal - - J_t_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 
3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - if (ir0 >= 0): - J_r_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal 
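
Reviewer note: the J_*_scal factors accumulated above convert each particle's normalized momentum to a velocity via 1/gamma and project it onto the local radial and azimuthal directions. A standalone recap with made-up values, not part of the patch:

import math
from scipy.constants import c

wj, uxj, uyj, uzj = 1.0, 0.1, 0.2, 0.5
inv_gammaj = 1./math.sqrt( 1. + uxj**2 + uyj**2 + uzj**2 )
cos, sin = 0.6, 0.8
J_r_scal = wj * c * inv_gammaj*( cos*uxj + sin*uyj )   # radial current
J_t_scal = wj * c * inv_gammaj*( cos*uyj - sin*uxj )   # azimuthal current
J_z_scal = wj * c * inv_gammaj*uzj                     # longitudinal current
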
- J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal - - J_r_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal - J_r_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal - J_r_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal - J_r_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal - J_r_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal - J_r_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal - J_r_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal - J_r_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal - - J_t_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal - - J_t_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal - J_t_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal - J_t_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal - J_t_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal - J_t_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal - J_t_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal - J_t_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal - J_t_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal - - J_z_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_01 += r_shape(r_cell, 
0)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal - - J_z_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal - J_z_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal - J_z_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal - J_z_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal - J_z_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal - J_z_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal - J_z_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal - J_z_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal - - # Index Shifting since local copies are centered around - # the current cell - srl = 0 # shift r lower - sru = 0 # shift r upper inner - sru2 = 0 # shift r upper outer - szl = 0 # shift z lower - szu = 0 # shift z upper inner - szu2 = 0 # shift z upper outer - if (iz_cell-1) < 0: - szl += Nz - if (iz_cell) == (Nz - 1): - szu -= Nz - szu2 -= Nz - if (iz_cell+1) == (Nz - 1): - szu2 -= Nz - if (ir_cell) >= (Nr - 1): - sru = -1 - sru2 = -2 - if (ir_cell+1) == (Nr - 1): - sru2 = -1 - if (ir_cell-1) < 0: - srl = 1 - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m0_00) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell - 1 + srl), J_r_m0_01) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m0_02) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m0_03) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell ), J_r_m0_10) - 
cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_11) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_11.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_11.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell), J_r_m0_12) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell), J_r_m0_13) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m0_20) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 1 + sru), J_r_m0_21) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m0_22) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m0_23) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m0_30) - cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 2 + sru2), J_r_m0_31) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m0_32) - cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m0_33) - cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m0_00) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell - 1 + srl), J_t_m0_01) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m0_02) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + 
srl), J_t_m1_02.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m0_03) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell ), J_t_m0_10) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_11) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_11.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_11.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell), J_t_m0_12) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell), J_t_m0_13) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m0_20) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 1 + sru), J_t_m0_21) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m0_22) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m0_23) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m0_30) - cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 2 + sru2), J_t_m0_31) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m0_32) - cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m0_33) - cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.imag) - - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m0_00) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell - 1 + srl), J_z_m0_01) - 
cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m0_02) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m0_03) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell ), J_z_m0_10) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_11) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_11.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_11.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell), J_z_m0_12) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell), J_z_m0_13) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m0_20) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 1 + sru), J_z_m0_21) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m0_22) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m0_23) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m0_30) - cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 2 + sru2), J_z_m0_31) - cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m0_32) - cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.imag) - - cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m0_33) - cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.real) - cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 
2 + sru2), J_z_m1_33.imag)
diff --git a/fbpic/particles/cuda_deposition/linear.py b/fbpic/particles/cuda_deposition/linear.py
deleted file mode 100644
index 48adff845..000000000
--- a/fbpic/particles/cuda_deposition/linear.py
+++ /dev/null
@@ -1,482 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the deposition methods for rho and J for linear order shapes
-"""
-from numba import cuda, int64
-import math
-from scipy.constants import c
-import numpy as np
-
-
-# Shape Factor functions to compute particle shapes.
-@cuda.jit(device=True, inline=True)
-def get_z_shape_linear(cell_position, index):
-    iz = int64(math.floor(cell_position))
-    if index == 0:
-        return iz+1.-cell_position
-    if index == 1:
-        return cell_position - iz
-
-
-@cuda.jit(device=True, inline=True)
-def get_r_shape_linear(cell_position, index):
-    flip_factor = 1.
-    ir = int64(math.floor(cell_position))
-    if index == 0:
-        if ir < 0:
-            flip_factor = -1.
-        return flip_factor*(ir+1.-cell_position)
-    if index == 1:
-        return flip_factor*(cell_position - ir)
-
-
-# -------------------------------
-# Field deposition utility - rho
-# -------------------------------
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-    float64, float64, int32, \
-    float64, float64, int32, \
-    complex128[:,:], complex128[:,:], \
-    int32[:], int32[:])')
-def deposit_rho_gpu_linear(x, y, z, w,
-                           invdz, zmin, Nz,
-                           invdr, rmin, Nr,
-                           rho_m0, rho_m1,
-                           cell_idx, prefix_sum):
-    """
-    Deposition of the charge density rho using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of rho that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 4 variables (one for each possible direction,
-    e.g. upper in z, lower in r) to maintain parallelism while
-    avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho_m0, rho_m1 : 2darrays of complexs
-        The charge density on the interpolation grid for
-        mode 0 and 1. (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the considered direction
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    if i < prefix_sum.shape[0]:
-        # Calculate the cell index in 2D from the 1D threadIdx
-        iz_cell = int(i / Nr)
-        ir_cell = int(i - iz_cell * Nr)
-        # Calculate the inclusive offset for the current cell
-        # It represents the number of particles contained in all other cells
-        # with an index smaller than i + the total number of particles in the
-        # current cell (inclusive).
-        incl_offset = np.int32(prefix_sum[i])
-        # Calculate the frequency per cell from the offset and the previous
-        # offset (prefix_sum[i-1]).
- if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare local field arrays - R_m0_00 = 0. - R_m0_01 = 0. - R_m0_10 = 0. - R_m0_11 = 0. - - R_m1_00 = 0. + 0.j - R_m1_01 = 0. + 0.j - R_m1_10 = 0. + 0.j - R_m1_11 = 0. + 0.j - - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0_scal = wj * exptheta_m0 - # Mode 1 - R_m1_scal = wj * exptheta_m1 - - # Boundary Region Shifts - ir_lower = int64(math.floor(r_cell)) - - R_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * R_m1_scal - - if ir_lower == -1: - R_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m1_scal - else: - R_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m0_scal - R_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m0_scal - R_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m1_scal - R_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m1_scal - - # Cell shifts for the simulation boundaries - shift_r = 0 - shift_z = 0 - if ir_cell+1 > (Nr-1): - shift_r = -1 - if iz_cell+1 > Nz-1: - shift_z -= Nz - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_00.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_00.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_00.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell), R_m0_01.real) - cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell), R_m1_01.real) - cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell), R_m1_01.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell+1 + shift_r), R_m0_10.real) - cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell+1 + shift_r), R_m1_10.real) - cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell+1 + shift_r), R_m1_10.imag) - - cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m0_11.real) - cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.real) - 
cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.imag) - - -# ------------------------------- -# Field deposition utility - J -# ------------------------------- - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:], \ - complex128[:,:], complex128[:,:],\ - int32[:], int32[:])') -def deposit_J_gpu_linear(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - j_r_m0, j_r_m1, - j_t_m0, j_t_m1, - j_z_m0, j_z_m1, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 4 variables (one for each possible direction, - e.g. upper in z, lower in r) to maintain parallelism while - avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs - The current component in each direction (r, t, z) - on the interpolation grid for mode 0 and 1. - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz_cell = int(i/Nr) - ir_cell = int(i - iz_cell * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - - # Declare the local field value for - # all possible deposition directions, - # depending on the shape order and per mode for r,t and z. - - J_r_m0_00 = 0. - J_r_m1_00 = 0. + 0.j - J_t_m0_00 = 0.# + 0.j - J_t_m1_00 = 0. + 0.j - J_z_m0_00 = 0. - J_z_m1_00 = 0. + 0.j - - J_r_m0_01 = 0. - J_r_m1_01 = 0. + 0.j - J_t_m0_01 = 0. - J_t_m1_01 = 0. + 0.j - J_z_m0_01 = 0. - J_z_m1_01 = 0. + 0.j - - J_r_m0_10 = 0. - J_r_m1_10 = 0. + 0.j - J_t_m0_10 = 0. - J_t_m1_10 = 0. + 0.j - J_z_m0_10 = 0. - J_z_m1_10 = 0. + 0.j - - J_r_m0_11 = 0. - J_r_m1_11 = 0. + 0.j - J_t_m0_11 = 0. 
- J_t_m1_11 = 0. + 0.j - J_z_m0_11 = 0. - J_z_m1_11 = 0. + 0.j - - - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset-1-j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1./rj - cos = xj*invr # Cosine - sin = yj*invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j*sin - - # Get weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 - J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 - J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 - # Mode 1 - J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 - J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 - J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 - - # Deposit on local copies at respective position - ir_lower = int64(math.floor(r_cell)) - - J_r_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - - # Take into account lower r flips - if ir_lower == -1: - J_r_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * 
J_t_m1_scal - J_z_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - else: - J_r_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal - J_t_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal - J_z_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal - J_r_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal - J_t_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal - J_z_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal - J_r_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal - J_t_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal - J_z_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal - J_r_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal - J_t_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal - J_z_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal - - # Cell shifts for the simulation boundaries - shift_r = 0 - shift_z = 0 - if (ir_cell+1) > (Nr-1): - shift_r = -1 - if (iz_cell+1) > Nz-1: - shift_z -= Nz - - # Atomically add the registers to global memory - if frequency_per_cell > 0: - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_00.real) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_00.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_00.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell), J_r_m0_01.real) - cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell+1 + shift_r), J_r_m0_10.real) - cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.imag) - - cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m0_11.real) - cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.real) - cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_00.real) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_00.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_00.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell), J_t_m0_01.real) - cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell+1 + shift_r), J_t_m0_10.real) - cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.imag) - - cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m0_11.real) - cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.real) - cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + 
shift_z, ir_cell+1 + shift_r), J_t_m1_11.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_00.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_00.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_00.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell), J_z_m0_01.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell+1 + shift_r), J_z_m0_10.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m0_11.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.imag)
diff --git a/fbpic/particles/cuda_deposition/linear_non_atomic.py b/fbpic/particles/cuda_deposition/linear_non_atomic.py
deleted file mode 100644
index 6dea16d21..000000000
--- a/fbpic/particles/cuda_deposition/linear_non_atomic.py
+++ /dev/null
@@ -1,671 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the deposition methods for rho and J for linear order shapes
-without using atomic operations.
-"""
-from numba import cuda
-import math
-from scipy.constants import c
-import numpy as np
-
-# -------------------------------
-# Field deposition utility - rho
-# -------------------------------
-
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-    float64, float64, int32, \
-    float64, float64, int32, \
-    complex128[:,:,:], complex128[:,:,:], \
-    complex128[:,:,:], complex128[:,:,:],\
-    int32[:], int32[:])')
-def deposit_rho_gpu(x, y, z, w,
-                    invdz, zmin, Nz,
-                    invdr, rmin, Nr,
-                    rho0, rho1,
-                    rho2, rho3,
-                    cell_idx, prefix_sum):
-    """
-    Deposition of the charge density rho using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of rho that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 4 arrays (one for each possible direction,
-    e.g. upper in z, lower in r) to maintain parallelism while
-    avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho0, rho1, rho2, rho3 : 3darray of complexs
-        2d field arrays, one for each of the deposition directions
-        The third dimension contains the two possible modes.
- (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the considered direction - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz = int(i / Nr) - ir = int(i - iz * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - # Initialize the local field value for - # all four possible deposition directions - # Mode 0, 1 for r, t, z - # 1 : lower in r, lower in z - # 2 : lower in r, upper in z - # 3 : upper in r, lower in z - # 4 : upper in r, upper in z - R1_m0 = 0. + 0.j - R2_m0 = 0. + 0.j - R3_m0 = 0. + 0.j - R4_m0 = 0. + 0.j - # ------------ - R1_m1 = 0. + 0.j - R2_m1 = 0. + 0.j - R3_m1 = 0. + 0.j - R4_m1 = 0. + 0.j - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index before the sorting - # -------------------------------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset - 1 - j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1. / rj - cos = xj * invr # Cosine - sin = yj * invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j * sin - - # Get linear weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr * (rj - rmin) - 0.5 - z_cell = invdz * (zj - zmin) - 0.5 - # Original index of the uppper and lower cell - ir_lower = int(math.floor(r_cell)) - ir_upper = ir_lower + 1 - iz_lower = int(math.floor(z_cell)) - iz_upper = iz_lower + 1 - # Linear weight - Sr_lower = ir_upper - r_cell - Sr_upper = r_cell - ir_lower - Sz_lower = iz_upper - z_cell - Sz_upper = z_cell - iz_lower - # Set guard weights to zero - Sr_guard = 0. - - # Treat the boundary conditions - # -------------------------------------------- - # guard cells in lower r - if ir_lower < 0: - Sr_guard = Sr_lower - Sr_lower = 0. 
- ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr - 1: - ir_lower = Nr - 1 - if ir_upper > Nr - 1: - ir_upper = Nr - 1 - # periodic boundaries in z - # lower z boundaries - if iz_lower < 0: - iz_lower += Nz - if iz_upper < 0: - iz_upper += Nz - # upper z boundaries - if iz_lower > Nz - 1: - iz_lower -= Nz - if iz_upper > Nz - 1: - iz_upper -= Nz - - # Calculate rho - # -------------------------------------------- - # Mode 0 - R_m0 = wj * exptheta_m0 - # Mode 1 - R_m1 = wj * exptheta_m1 - - # Caculate the weighted currents for each - # of the four possible direction - # -------------------------------------------- - if ir_lower == ir_upper: - # In the case that ir_lower and ir_upper are equal, - # the current is added only to the array corresponding - # to ir_lower. - # (This is the case for the boundaries in r) - R1_m0 += Sz_lower * Sr_lower * R_m0 - R1_m0 += Sz_lower * Sr_upper * R_m0 - R3_m0 += Sz_upper * Sr_lower * R_m0 - R3_m0 += Sz_upper * Sr_upper * R_m0 - # ----------------------------- - R1_m1 += Sz_lower * Sr_lower * R_m1 - R1_m1 += Sz_lower * Sr_upper * R_m1 - R3_m1 += Sz_upper * Sr_lower * R_m1 - R3_m1 += Sz_upper * Sr_upper * R_m1 - # ----------------------------- - if ir_lower != ir_upper: - # In the case that ir_lower and ir_upper are different, - # add the current to the four arrays according to - # the direction. - R1_m0 += Sz_lower * Sr_lower * R_m0 - R2_m0 += Sz_lower * Sr_upper * R_m0 - R3_m0 += Sz_upper * Sr_lower * R_m0 - R4_m0 += Sz_upper * Sr_upper * R_m0 - # ----------------------------- - R1_m1 += Sz_lower * Sr_lower * R_m1 - R2_m1 += Sz_lower * Sr_upper * R_m1 - R3_m1 += Sz_upper * Sr_lower * R_m1 - R4_m1 += Sz_upper * Sr_upper * R_m1 - # ----------------------------- - if ir_lower == ir_upper == 0: - # Treat the guard cells. - # Add the current to the guard cells - # for particles that had an original - # cell index < 0. - R1_m0 += -1. * Sz_lower * Sr_guard * R_m0 - R3_m0 += -1. * Sz_upper * Sr_guard * R_m0 - # --------------------------------- - R1_m1 += -1. * Sz_lower * Sr_guard * R_m1 - R3_m1 += -1. * Sz_upper * Sr_guard * R_m1 - # Write the calculated field values to - # the field arrays defined on the interpolation grid - rho0[iz, ir, 0] = R1_m0 - rho0[iz, ir, 1] = R1_m1 - rho1[iz, ir, 0] = R2_m0 - rho1[iz, ir, 1] = R2_m1 - rho2[iz, ir, 0] = R3_m0 - rho2[iz, ir, 1] = R3_m1 - rho3[iz, ir, 0] = R4_m0 - rho3[iz, ir, 1] = R4_m1 - - -@cuda.jit('void(complex128[:,:], complex128[:,:], \ - complex128[:,:,:], complex128[:,:,:], \ - complex128[:,:,:], complex128[:,:,:])') -def add_rho(rho_m0, rho_m1, - rho0, rho1, - rho2, rho3): - """ - Merges the 4 separate field arrays that contain rho for - each deposition direction and adds them to the global - interpolation grid arrays for mode 0 and 1. - - Parameters - ---------- - rho_m0, rho_m1 : 2darrays of complexs - The charge density on the interpolation grid for - mode 0 and 1. (is modified by this function) - - rho0, rho1, rho2, rho3 : 3darrays of complexs - 2d field arrays, one for each of the deposition directions - The third dimension contains the two possible modes. 
- """ - # Get the CUDA Grid in 2D - i, j = cuda.grid(2) - # Only for threads within (nz, nr) - if (i < rho_m0.shape[0] and j < rho_m0.shape[1]): - # Sum the four field arrays for the different deposition - # directions and write them to the global field array - rho_m0[i, j] += rho0[i, j, 0] + \ - rho1[i, j - 1, 0] + \ - rho2[i - 1, j, 0] + \ - rho3[i - 1, j - 1, 0] - - rho_m1[i, j] += rho0[i, j, 1] + \ - rho1[i, j - 1, 1] + \ - rho2[i - 1, j, 1] + \ - rho3[i - 1, j - 1, 1] - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32, \ - complex128[:,:,:], complex128[:,:,:], \ - complex128[:,:,:], complex128[:,:,:],\ - int32[:], int32[:])') -def deposit_J_gpu(x, y, z, w, - ux, uy, uz, inv_gamma, - invdz, zmin, Nz, - invdr, rmin, Nr, - J0, J1, - J2, J3, - cell_idx, prefix_sum): - """ - Deposition of the current J using numba on the GPU. - Iterates over the cells and over the particles per cell. - Calculates the weighted amount of J that is deposited to the - 4 cells surounding the particle based on its shape (linear). - - The particles are sorted by their cell index (the lower cell - in r and z that they deposit to) and the deposited field - is split into 4 arrays (one for each possible direction, - e.g. upper in z, lower in r) to maintain parallelism while - avoiding any race conditions. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - - w : 1d array of floats - The weights of the particles - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - J0, J1, J2, J3 : 3darray of complexs - 2d field arrays, one for each of the deposition directions - The third dimension contains the two possible modes and the - 3 directions of J in cylindrical coordinates (r, t, z). - (is mofidied by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, - along the direction considered - - Nz, Nr : int - Number of gridpoints along the considered direction - - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # Get the 1D CUDA grid - i = cuda.grid(1) - # Deposit the field per cell in parallel (for threads < number of cells) - if i < prefix_sum.shape[0]: - # Calculate the cell index in 2D from the 1D threadIdx - iz = int(i / Nr) - ir = int(i - iz * Nr) - # Calculate the inclusive offset for the current cell - # It represents the number of particles contained in all other cells - # with an index smaller than i + the total number of particles in the - # current cell (inclusive). - incl_offset = np.int32(prefix_sum[i]) - # Calculate the frequency per cell from the offset and the previous - # offset (prefix_sum[i-1]). - if i > 0: - frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) - if i == 0: - frequency_per_cell = np.int32(incl_offset) - # Initialize the local field value for - # all four possible deposition directions - # Mode 0, 1 for r, t, z - # 1 : lower in r, lower in z - # 2 : lower in r, upper in z - # 3 : upper in r, lower in z - # 4 : upper in r, upper in z - Jr1_m0 = 0. + 0.j - Jr2_m0 = 0. + 0.j - Jr3_m0 = 0. + 0.j - Jr4_m0 = 0. 
+ 0.j - # ------------- - Jr1_m1 = 0. + 0.j - Jr2_m1 = 0. + 0.j - Jr3_m1 = 0. + 0.j - Jr4_m1 = 0. + 0.j - # ------------- - Jt1_m0 = 0. + 0.j - Jt2_m0 = 0. + 0.j - Jt3_m0 = 0. + 0.j - Jt4_m0 = 0. + 0.j - # ------------- - Jt1_m1 = 0. + 0.j - Jt2_m1 = 0. + 0.j - Jt3_m1 = 0. + 0.j - Jt4_m1 = 0. + 0.j - # ------------- - Jz1_m0 = 0. + 0.j - Jz2_m0 = 0. + 0.j - Jz3_m0 = 0. + 0.j - Jz4_m0 = 0. + 0.j - # ------------- - Jz1_m1 = 0. + 0.j - Jz2_m1 = 0. + 0.j - Jz3_m1 = 0. + 0.j - Jz4_m1 = 0. + 0.j - # Loop over the number of particles per cell - for j in range(frequency_per_cell): - # Get the particle index - # ---------------------- - # (Since incl_offset is a cumulative sum of particle number, - # and since python index starts at 0, one has to add -1) - ptcl_idx = incl_offset - 1 - j - - # Preliminary arrays for the cylindrical conversion - # -------------------------------------------- - # Position - xj = x[ptcl_idx] - yj = y[ptcl_idx] - zj = z[ptcl_idx] - # Velocity - uxj = ux[ptcl_idx] - uyj = uy[ptcl_idx] - uzj = uz[ptcl_idx] - # Inverse gamma - inv_gammaj = inv_gamma[ptcl_idx] - # Weights - wj = w[ptcl_idx] - - # Cylindrical conversion - rj = math.sqrt(xj**2 + yj**2) - # Avoid division by 0. - if (rj != 0.): - invr = 1. / rj - cos = xj * invr # Cosine - sin = yj * invr # Sine - else: - cos = 1. - sin = 0. - exptheta_m0 = 1. - exptheta_m1 = cos + 1.j * sin - - # Get linear weights for the deposition - # -------------------------------------------- - # Positions of the particles, in the cell unit - r_cell = invdr * (rj - rmin) - 0.5 - z_cell = invdz * (zj - zmin) - 0.5 - # Original index of the uppper and lower cell - # in r and z - ir_lower = int(math.floor(r_cell)) - ir_upper = ir_lower + 1 - iz_lower = int(math.floor(z_cell)) - iz_upper = iz_lower + 1 - # Linear weight - Sr_lower = ir_upper - r_cell - Sr_upper = r_cell - ir_lower - Sz_lower = iz_upper - z_cell - Sz_upper = z_cell - iz_lower - # Set guard weights to zero - Sr_guard = 0. - - # Treat the boundary conditions - # -------------------------------------------- - # guard cells in lower r - if ir_lower < 0: - Sr_guard = Sr_lower - Sr_lower = 0. - ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr - 1: - ir_lower = Nr - 1 - if ir_upper > Nr - 1: - ir_upper = Nr - 1 - # periodic boundaries in z - # lower z boundaries - if iz_lower < 0: - iz_lower += Nz - if iz_upper < 0: - iz_upper += Nz - # upper z boundaries - if iz_lower > Nz - 1: - iz_lower -= Nz - if iz_upper > Nz - 1: - iz_upper -= Nz - - # Calculate the currents - # -------------------------------------------- - # Mode 0 - Jr_m0 = wj * c * inv_gammaj * (cos * uxj + sin * uyj) * exptheta_m0 - Jt_m0 = wj * c * inv_gammaj * (cos * uyj - sin * uxj) * exptheta_m0 - Jz_m0 = wj * c * inv_gammaj * uzj * exptheta_m0 - # Mode 1 - Jr_m1 = wj * c * inv_gammaj * (cos * uxj + sin * uyj) * exptheta_m1 - Jt_m1 = wj * c * inv_gammaj * (cos * uyj - sin * uxj) * exptheta_m1 - Jz_m1 = wj * c * inv_gammaj * uzj * exptheta_m1 - - # Caculate the weighted currents for each - # of the four possible direction - # -------------------------------------------- - if ir_lower == ir_upper: - # In the case that ir_lower and ir_upper are equal, - # the current is added only to the array corresponding - # to ir_lower. 
- # (This is the case for the boundaries in r) - Jr1_m0 += Sz_lower * Sr_lower * Jr_m0 - Jr1_m0 += Sz_lower * Sr_upper * Jr_m0 - Jr3_m0 += Sz_upper * Sr_lower * Jr_m0 - Jr3_m0 += Sz_upper * Sr_upper * Jr_m0 - # ------------------------------- - Jr1_m1 += Sz_lower * Sr_lower * Jr_m1 - Jr1_m1 += Sz_lower * Sr_upper * Jr_m1 - Jr3_m1 += Sz_upper * Sr_lower * Jr_m1 - Jr3_m1 += Sz_upper * Sr_upper * Jr_m1 - # ------------------------------- - Jt1_m0 += Sz_lower * Sr_lower * Jt_m0 - Jt1_m0 += Sz_lower * Sr_upper * Jt_m0 - Jt3_m0 += Sz_upper * Sr_lower * Jt_m0 - Jt3_m0 += Sz_upper * Sr_upper * Jt_m0 - # ------------------------------- - Jt1_m1 += Sz_lower * Sr_lower * Jt_m1 - Jt1_m1 += Sz_lower * Sr_upper * Jt_m1 - Jt3_m1 += Sz_upper * Sr_lower * Jt_m1 - Jt3_m1 += Sz_upper * Sr_upper * Jt_m1 - # ------------------------------- - Jz1_m0 += Sz_lower * Sr_lower * Jz_m0 - Jz1_m0 += Sz_lower * Sr_upper * Jz_m0 - Jz3_m0 += Sz_upper * Sr_lower * Jz_m0 - Jz3_m0 += Sz_upper * Sr_upper * Jz_m0 - # ------------------------------- - Jz1_m1 += Sz_lower * Sr_lower * Jz_m1 - Jz1_m1 += Sz_lower * Sr_upper * Jz_m1 - Jz3_m1 += Sz_upper * Sr_lower * Jz_m1 - Jz3_m1 += Sz_upper * Sr_upper * Jz_m1 - # ------------------------------- - if ir_lower != ir_upper: - # In the case that ir_lower and ir_upper are different, - # add the current to the four arrays according to - # the direction. - Jr1_m0 += Sz_lower * Sr_lower * Jr_m0 - Jr2_m0 += Sz_lower * Sr_upper * Jr_m0 - Jr3_m0 += Sz_upper * Sr_lower * Jr_m0 - Jr4_m0 += Sz_upper * Sr_upper * Jr_m0 - # ------------------------------- - Jr1_m1 += Sz_lower * Sr_lower * Jr_m1 - Jr2_m1 += Sz_lower * Sr_upper * Jr_m1 - Jr3_m1 += Sz_upper * Sr_lower * Jr_m1 - Jr4_m1 += Sz_upper * Sr_upper * Jr_m1 - # ------------------------------- - Jt1_m0 += Sz_lower * Sr_lower * Jt_m0 - Jt2_m0 += Sz_lower * Sr_upper * Jt_m0 - Jt3_m0 += Sz_upper * Sr_lower * Jt_m0 - Jt4_m0 += Sz_upper * Sr_upper * Jt_m0 - # ------------------------------- - Jt1_m1 += Sz_lower * Sr_lower * Jt_m1 - Jt2_m1 += Sz_lower * Sr_upper * Jt_m1 - Jt3_m1 += Sz_upper * Sr_lower * Jt_m1 - Jt4_m1 += Sz_upper * Sr_upper * Jt_m1 - # ------------------------------- - Jz1_m0 += Sz_lower * Sr_lower * Jz_m0 - Jz2_m0 += Sz_lower * Sr_upper * Jz_m0 - Jz3_m0 += Sz_upper * Sr_lower * Jz_m0 - Jz4_m0 += Sz_upper * Sr_upper * Jz_m0 - # ------------------------------- - Jz1_m1 += Sz_lower * Sr_lower * Jz_m1 - Jz2_m1 += Sz_lower * Sr_upper * Jz_m1 - Jz3_m1 += Sz_upper * Sr_lower * Jz_m1 - Jz4_m1 += Sz_upper * Sr_upper * Jz_m1 - # ------------------------------- - if ir_lower == ir_upper == 0: - # Treat the guard cells. - # Add the current to the guard cells - # for particles that had an original - # cell index < 0. - Jr1_m0 += -1. * Sz_lower * Sr_guard * Jr_m0 - Jr3_m0 += -1. * Sz_upper * Sr_guard * Jr_m0 - # ----------------------------------- - Jr1_m1 += -1. * Sz_lower * Sr_guard * Jr_m1 - Jr3_m1 += -1. * Sz_upper * Sr_guard * Jr_m1 - # ----------------------------------- - Jt1_m0 += -1. * Sz_lower * Sr_guard * Jt_m0 - Jt3_m0 += -1. * Sz_upper * Sr_guard * Jt_m0 - # ----------------------------------- - Jt1_m1 += -1. * Sz_lower * Sr_guard * Jt_m1 - Jt3_m1 += -1. * Sz_upper * Sr_guard * Jt_m1 - # ----------------------------------- - Jz1_m0 += -1. * Sz_lower * Sr_guard * Jz_m0 - Jz3_m0 += -1. * Sz_upper * Sr_guard * Jz_m0 - # ----------------------------------- - Jz1_m1 += -1. * Sz_lower * Sr_guard * Jz_m1 - Jz3_m1 += -1. 
* Sz_upper * Sr_guard * Jz_m1
-        # Write the calculated field values to
-        # the field arrays defined on the interpolation grid
-        J0[iz, ir, 0] = Jr1_m0
-        J0[iz, ir, 1] = Jr1_m1
-        J0[iz, ir, 2] = Jt1_m0
-        J0[iz, ir, 3] = Jt1_m1
-        J0[iz, ir, 4] = Jz1_m0
-        J0[iz, ir, 5] = Jz1_m1
-        # --------------------
-        J1[iz, ir, 0] = Jr2_m0
-        J1[iz, ir, 1] = Jr2_m1
-        J1[iz, ir, 2] = Jt2_m0
-        J1[iz, ir, 3] = Jt2_m1
-        J1[iz, ir, 4] = Jz2_m0
-        J1[iz, ir, 5] = Jz2_m1
-        # --------------------
-        J2[iz, ir, 0] = Jr3_m0
-        J2[iz, ir, 1] = Jr3_m1
-        J2[iz, ir, 2] = Jt3_m0
-        J2[iz, ir, 3] = Jt3_m1
-        J2[iz, ir, 4] = Jz3_m0
-        J2[iz, ir, 5] = Jz3_m1
-        # --------------------
-        J3[iz, ir, 0] = Jr4_m0
-        J3[iz, ir, 1] = Jr4_m1
-        J3[iz, ir, 2] = Jt4_m0
-        J3[iz, ir, 3] = Jt4_m1
-        J3[iz, ir, 4] = Jz4_m0
-        J3[iz, ir, 5] = Jz4_m1
-
-
-@cuda.jit('void(complex128[:,:], complex128[:,:], \
-    complex128[:,:], complex128[:,:], \
-    complex128[:,:], complex128[:,:], \
-    complex128[:,:,:], complex128[:,:,:], \
-    complex128[:,:,:], complex128[:,:,:])')
-def add_J(Jr_m0, Jr_m1,
-          Jt_m0, Jt_m1,
-          Jz_m0, Jz_m1,
-          J0, J1,
-          J2, J3):
-    """
-    Merges the 4 separate field arrays that contain J for
-    each deposition direction and adds them to the global
-    interpolation grid arrays for mode 0 and 1.
-
-    Parameters
-    ----------
-    Jr_m0, Jr_m1, Jt_m0, Jt_m1, Jz_m0, Jz_m1,: 2darrays of complexs
-        The current component in each direction (r, t, z)
-        on the interpolation grid for mode 0 and 1.
-        (is modified by this function)
-
-    J0, J1, J2, J3 : 3darrays of complexs
-        2d field arrays, one for each of the deposition directions
-        The third dimension contains the two possible modes and
-        the 3 different components of J (r, t, z).
-    """
-    # Get the CUDA Grid in 2D
-    i, j = cuda.grid(2)
-    # Only for threads within (nz, nr)
-    if (i < Jr_m0.shape[0] and j < Jr_m0.shape[1]):
-        # Sum the four field arrays for the different deposition
-        # directions and write them to the global field array
-        Jr_m0[i, j] += J0[i, j, 0] + \
-            J1[i, j - 1, 0] + \
-            J2[i - 1, j, 0] + \
-            J3[i - 1, j - 1, 0]
-
-        Jr_m1[i, j] += J0[i, j, 1] + \
-            J1[i, j - 1, 1] + \
-            J2[i - 1, j, 1] + \
-            J3[i - 1, j - 1, 1]
-
-        Jt_m0[i, j] += J0[i, j, 2] + \
-            J1[i, j - 1, 2] + \
-            J2[i - 1, j, 2] + \
-            J3[i - 1, j - 1, 2]
-
-        Jt_m1[i, j] += J0[i, j, 3] + \
-            J1[i, j - 1, 3] + \
-            J2[i - 1, j, 3] + \
-            J3[i - 1, j - 1, 3]
-
-        Jz_m0[i, j] += J0[i, j, 4] + \
-            J1[i, j - 1, 4] + \
-            J2[i - 1, j, 4] + \
-            J3[i - 1, j - 1, 4]
-
-        Jz_m1[i, j] += J0[i, j, 5] + \
-            J1[i, j - 1, 5] + \
-            J2[i - 1, j, 5] + \
-            J3[i - 1, j - 1, 5]
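Both the deleted kernels above and their replacements below recover the number of particles per cell from an inclusive prefix sum over the cell-sorted particle arrays. A minimal CPU-side sketch of that indexing convention, in plain NumPy (illustrative only; none of these names are part of the patch):

    import numpy as np

    # Hypothetical example: particles are sorted by cell index, and
    # prefix_sum[i] is the inclusive cumulative particle count of
    # cells 0..i, as produced by FBPIC's sorting step.
    counts = np.array([3, 0, 2, 1])      # particles per cell (example data)
    prefix_sum = np.cumsum(counts)       # inclusive: [3, 3, 5, 6]

    for i in range(len(prefix_sum)):
        incl_offset = int(prefix_sum[i])
        # Count in cell i: difference with the previous inclusive offset
        n_in_cell = incl_offset - int(prefix_sum[i-1]) if i > 0 else incl_offset
        # Sorted indices of the particles in cell i (the -1 accounts for
        # 0-based indexing, exactly as in the kernels above and below)
        indices = [incl_offset - 1 - j for j in range(n_in_cell)]

This is why the kernels compute `frequency_per_cell` from `prefix_sum[i] - prefix_sum[i-1]` (with the `i == 0` special case) and walk particles backwards from `incl_offset - 1`.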
+""" +from numba import cuda, int64 +import math +from scipy.constants import c +import numpy as np + +# ------------------------------- +# Particle shape Factor functions +# ------------------------------- + +# Linear shapes +@cuda.jit(device=True, inline=True) +def z_shape_linear(cell_position, index): + iz = int64(math.floor(cell_position)) + if index == 0: + return iz+1.-cell_position + if index == 1: + return cell_position - iz + +@cuda.jit(device=True, inline=True) +def r_shape_linear(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(ir+1.-cell_position) + if index == 1: + return flip_factor*(cell_position - ir) + +# Cubic shapes +@cuda.jit(device=True, inline=True) +def z_shape_cubic(cell_position, index): + iz = int64(math.floor(cell_position)) - 1 + if index == 0: + return (-1./6.)*((cell_position-iz)-2)**3 + if index == 1: + return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) + if index == 2: + return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) + if index == 3: + return (-1./6.)*(((iz+3)-cell_position)-2)**3 + +@cuda.jit(device=True, inline=True) +def r_shape_cubic(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) - 1 + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 + if index == 1: + if ir+1 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) + if index == 2: + if ir+2 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) + if index == 3: + if ir+3 < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 + +# ------------------------------- +# Field deposition - linear - rho +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + int32[:], int32[:])') +def deposit_rho_gpu_linear(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0, rho_m1, + cell_idx, prefix_sum): + """ + Deposition of the charge density rho using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of rho that is deposited to the + 4 cells surounding the particle based on its shape (linear). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 4 variables (one for each possible direction, + e.g. upper in z, lower in r) to maintain parallelism while + avoiding any race conditions. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0, rho_m1 : 2darrays of complexs + The charge density on the interpolation grid for + mode 0 and 1. 
(is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i / Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare local field arrays + R_m0_00 = 0. + R_m0_01 = 0. + R_m0_10 = 0. + R_m0_11 = 0. + + R_m1_00 = 0. + 0.j + R_m1_01 = 0. + 0.j + R_m1_10 = 0. + 0.j + R_m1_11 = 0. + 0.j + + for j in range(frequency_per_cell): + # Get the particle index before the sorting + # -------------------------------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. 
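+            # Azimuthal phase factors exp(i*m*theta): mode 0 reduces to 1,
+            # mode 1 is cos(theta) + i*sin(theta)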
+ exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + + # Boundary Region Shifts + ir_lower = int64(math.floor(r_cell)) + + R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal + + if ir_lower == -1: + R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + else: + R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if ir_cell+1 > (Nr-1): + shift_r = -1 + if iz_cell+1 > Nz-1: + shift_z -= Nz + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_00.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_00.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_00.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell), R_m0_01.real) + cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell), R_m1_01.real) + cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell), R_m1_01.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell+1 + shift_r), R_m0_10.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell+1 + shift_r), R_m1_10.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell+1 + shift_r), R_m1_10.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m0_11.real) + cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.real) + cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.imag) + + +# ------------------------------- +# Field deposition - linear - J +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], float64[:], \ + float64, float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:], \ + complex128[:,:], complex128[:,:],\ + int32[:], int32[:])') +def deposit_J_gpu_linear(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0, j_r_m1, + j_t_m0, j_t_m1, + j_z_m0, j_z_m1, + cell_idx, prefix_sum): + """ + Deposition of the current J using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of J that is deposited to the + 4 cells surounding the particle based on its shape (linear). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 4 variables (one for each possible direction, + e.g. 
upper in z, lower in r) to maintain parallelism while
+    avoiding any race conditions.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (dimensionless)
+        The normalized momenta gamma*v/c of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1 : 2darrays of complexs
+        The current component in each direction (r, t, z)
+        on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    # Get the 1D CUDA grid
+    i = cuda.grid(1)
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    if i < prefix_sum.shape[0]:
+        # Calculate the cell index in 2D from the 1D threadIdx
+        iz_cell = int(i/Nr)
+        ir_cell = int(i - iz_cell * Nr)
+        # Calculate the inclusive offset for the current cell
+        # It represents the number of particles contained in all other cells
+        # with an index smaller than i + the total number of particles in the
+        # current cell (inclusive).
+        incl_offset = np.int32(prefix_sum[i])
+        # Calculate the frequency per cell from the offset and the previous
+        # offset (prefix_sum[i-1]).
+        if i > 0:
+            frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1])
+        if i == 0:
+            frequency_per_cell = np.int32(incl_offset)
+
+        # Declare the local field value for
+        # all possible deposition directions,
+        # depending on the shape order and per mode for r,t and z.
+
+        J_r_m0_00 = 0.
+        J_r_m1_00 = 0. + 0.j
+        J_t_m0_00 = 0.
+        J_t_m1_00 = 0. + 0.j
+        J_z_m0_00 = 0.
+        J_z_m1_00 = 0. + 0.j
+
+        J_r_m0_01 = 0.
+        J_r_m1_01 = 0. + 0.j
+        J_t_m0_01 = 0.
+        J_t_m1_01 = 0. + 0.j
+        J_z_m0_01 = 0.
+        J_z_m1_01 = 0. + 0.j
+
+        J_r_m0_10 = 0.
+        J_r_m1_10 = 0. + 0.j
+        J_t_m0_10 = 0.
+        J_t_m1_10 = 0. + 0.j
+        J_z_m0_10 = 0.
+        J_z_m1_10 = 0. + 0.j
+
+        J_r_m0_11 = 0.
+        J_r_m1_11 = 0. + 0.j
+        J_t_m0_11 = 0.
+        J_t_m1_11 = 0. + 0.j
+        J_z_m0_11 = 0.
+        J_z_m1_11 = 0. + 0.j
+
+
+        # Loop over the number of particles per cell
+        for j in range(frequency_per_cell):
+            # Get the particle index
+            # ----------------------
+            # (Since incl_offset is a cumulative sum of particle number,
+            # and since python index starts at 0, one has to add -1)
+            ptcl_idx = incl_offset-1-j
+
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Momenta
+            uxj = ux[ptcl_idx]
+            uyj = uy[ptcl_idx]
+            uzj = uz[ptcl_idx]
+            # Inverse gamma
+            inv_gammaj = inv_gamma[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr # Cosine
+                sin = yj*invr # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
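+            # Azimuthal phase factors exp(i*m*theta), as in the rho kernel:
+            # the mode-1 currents below carry the factor cos(theta) + i*sin(theta)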
+ exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Deposit on local copies at respective position + ir_lower = int64(math.floor(r_cell)) + + J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Take into account lower r flips + if ir_lower == -1: + J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + else: + J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * 
J_z_m1_scal + J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if (ir_cell+1) > (Nr-1): + shift_r = -1 + if (iz_cell+1) > Nz-1: + shift_z -= Nz + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_00.real) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_00.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_00.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell), J_r_m0_01.real) + cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell+1 + shift_r), J_r_m0_10.real) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m0_11.real) + cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_00.real) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_00.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_00.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell), J_t_m0_01.real) + cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell+1 + shift_r), J_t_m0_10.real) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m0_11.real) + cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_00.real) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_00.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_00.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell), J_z_m0_01.real) + cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell+1 + shift_r), J_z_m0_10.real) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m0_11.real) + cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.imag) + +# ------------------------------- +# Field deposition - cubic - rho +# ------------------------------- + +@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ + float64, 
float64, int32, \ + float64, float64, int32, \ + complex128[:,:], complex128[:,:], \ + int32[:], int32[:])') +def deposit_rho_gpu_cubic(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0, rho_m1, + cell_idx, prefix_sum): + """ + Deposition of the charge density rho using numba on the GPU. + Iterates over the cells and over the particles per cell. + Calculates the weighted amount of rho that is deposited to the + 16 cells surounding the particle based on its shape (cubic). + + The particles are sorted by their cell index (the lower cell + in r and z that they deposit to) and the deposited field + is split into 16 variables (one for each surrounding cell) to + maintain parallelism while avoiding any race conditions. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0, rho_m1 : 2darrays of complexs + The charge density on the interpolation grid for + mode 0 and 1. (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i / Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare local field arrays + R_m0_00 = 0. + R_m1_00 = 0. + 0.j + + R_m0_01 = 0. + R_m1_01 = 0. + 0.j + + R_m0_02 = 0. + R_m1_02 = 0. + 0.j + + R_m0_03 = 0. + R_m1_03 = 0. + 0.j + + R_m0_10 = 0. + R_m1_10 = 0. + 0.j + + R_m0_11 = 0. + R_m1_11 = 0. + 0.j + + R_m0_12 = 0. + R_m1_12 = 0. + 0.j + + R_m0_13 = 0. + R_m1_13 = 0. + 0.j + + R_m0_20 = 0. + R_m1_20 = 0. + 0.j + + R_m0_21 = 0. + R_m1_21 = 0. + 0.j + + R_m0_22 = 0. + R_m1_22 = 0. + 0.j + + R_m0_23 = 0. + R_m1_23 = 0. + 0.j + + R_m0_30 = 0. + R_m1_30 = 0. + 0.j + + R_m0_31 = 0. + R_m1_31 = 0. + 0.j + + R_m0_32 = 0. + R_m1_32 = 0. + 0.j + + R_m0_33 = 0. + R_m1_33 = 0. + 0.j + + for j in range(frequency_per_cell): + # Get the particle index before the sorting + # -------------------------------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. 
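+            # (for a particle exactly on the axis, theta is set to 0,
+            # i.e. cos = 1 and sin = 0)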
+ if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + # Compute values in local copies and consider boundaries + ir0 = int64(math.floor(r_cell)) - 1 + + if (ir0 == -2): + R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + if (ir0 == -1): + R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 
1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + if (ir0 >= 0): + R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m0_00.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell - 1 + srl), R_m0_01.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell - 1 + srl), R_m1_01.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell - 1 + srl), R_m1_01.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m0_02.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m0_03.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell ), R_m0_10.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell ), R_m1_10.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell ), R_m1_10.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_11.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_11.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_11.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell), R_m0_12.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell), R_m1_12.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell), R_m1_12.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell), R_m0_13.real) + cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell), R_m1_13.real) + cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell), R_m1_13.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m0_20.real) + cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.real) + cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.imag) + + cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 1 + sru), R_m0_21.real) + cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 1 + sru), R_m1_21.real) + cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 1 + sru), R_m1_21.imag) + + cuda.atomic.add(rho_m0.real, 
(iz_cell + 1 + szu, ir_cell + 1 + sru), R_m0_22.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m0_23.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m0_30.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 2 + sru2), R_m0_31.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 2 + sru2), R_m1_31.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 2 + sru2), R_m1_31.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m0_32.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m0_33.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.imag)
+
+
+# -------------------------------
+# Field deposition - cubic - J
+# -------------------------------
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+           float64[:], float64[:], float64[:], float64[:], \
+           float64, float64, int32, \
+           float64, float64, int32, \
+           complex128[:,:], complex128[:,:], \
+           complex128[:,:], complex128[:,:], \
+           complex128[:,:], complex128[:,:],\
+           int32[:], int32[:])')
+def deposit_J_gpu_cubic(x, y, z, w,
+                        ux, uy, uz, inv_gamma,
+                        invdz, zmin, Nz,
+                        invdr, rmin, Nr,
+                        j_r_m0, j_r_m1,
+                        j_t_m0, j_t_m1,
+                        j_z_m0, j_z_m1,
+                        cell_idx, prefix_sum):
+    """
+    Deposition of the current J using numba on the GPU.
+    Iterates over the cells and over the particles per cell.
+    Calculates the weighted amount of J that is deposited to the
+    16 cells surrounding the particle based on its shape (cubic).
+
+    The particles are sorted by their cell index (the lower cell
+    in r and z that they deposit to) and the deposited field
+    is split into 16 variables (one for each surrounding cell) to
+    maintain parallelism while avoiding any race conditions.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (dimensionless)
+        The normalized momenta gamma*v/c of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1 : 2darrays of complexs
+        The current component in each direction (r, t, z)
+        on the interpolation grid for mode 0 and 1.
+ (is modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + cell_idx : 1darray of integers + The cell index of the particle + + prefix_sum : 1darray of integers + Represents the cumulative sum of + the particles per cell + """ + # Get the 1D CUDA grid + i = cuda.grid(1) + # Deposit the field per cell in parallel (for threads < number of cells) + if i < prefix_sum.shape[0]: + # Calculate the cell index in 2D from the 1D threadIdx + iz_cell = int(i/Nr) + ir_cell = int(i - iz_cell * Nr) + # Calculate the inclusive offset for the current cell + # It represents the number of particles contained in all other cells + # with an index smaller than i + the total number of particles in the + # current cell (inclusive). + incl_offset = np.int32(prefix_sum[i]) + # Calculate the frequency per cell from the offset and the previous + # offset (prefix_sum[i-1]). + if i > 0: + frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1]) + if i == 0: + frequency_per_cell = np.int32(incl_offset) + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode for r,t and z. + J_r_m0_00 = 0. + J_t_m0_00 = 0. + J_z_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m1_00 = 0. + 0.j + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_t_m0_01 = 0. + J_z_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m1_01 = 0. + 0.j + J_z_m1_01 = 0. + 0.j + + J_r_m0_02 = 0. + J_t_m0_02 = 0. + J_z_m0_02 = 0. + J_r_m1_02 = 0. + 0.j + J_t_m1_02 = 0. + 0.j + J_z_m1_02 = 0. + 0.j + + J_r_m0_03 = 0. + J_t_m0_03 = 0. + J_z_m0_03 = 0. + J_r_m1_03 = 0. + 0.j + J_t_m1_03 = 0. + 0.j + J_z_m1_03 = 0. + 0.j + + J_r_m0_10 = 0. + J_t_m0_10 = 0. + J_z_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m1_10 = 0. + 0.j + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_t_m0_11 = 0. + J_z_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m1_11 = 0. + 0.j + J_z_m1_11 = 0. + 0.j + + J_r_m0_12 = 0. + J_t_m0_12 = 0. + J_z_m0_12 = 0. + J_r_m1_12 = 0. + 0.j + J_t_m1_12 = 0. + 0.j + J_z_m1_12 = 0. + 0.j + + J_r_m0_13 = 0. + J_t_m0_13 = 0. + J_z_m0_13 = 0. + J_r_m1_13 = 0. + 0.j + J_t_m1_13 = 0. + 0.j + J_z_m1_13 = 0. + 0.j + + J_r_m0_20 = 0. + J_t_m0_20 = 0. + J_z_m0_20 = 0. + J_r_m1_20 = 0. + 0.j + J_t_m1_20 = 0. + 0.j + J_z_m1_20 = 0. + 0.j + + J_r_m0_21 = 0. + J_t_m0_21 = 0. + J_z_m0_21 = 0. + J_r_m1_21 = 0. + 0.j + J_t_m1_21 = 0. + 0.j + J_z_m1_21 = 0. + 0.j + + J_r_m0_22 = 0. + J_t_m0_22 = 0. + J_z_m0_22 = 0. + J_r_m1_22 = 0. + 0.j + J_t_m1_22 = 0. + 0.j + J_z_m1_22 = 0. + 0.j + + J_r_m0_23 = 0. + J_t_m0_23 = 0. + J_z_m0_23 = 0. + J_r_m1_23 = 0. + 0.j + J_t_m1_23 = 0. + 0.j + J_z_m1_23 = 0. + 0.j + + J_r_m0_30 = 0. + J_t_m0_30 = 0. + J_z_m0_30 = 0. + J_r_m1_30 = 0. + 0.j + J_t_m1_30 = 0. + 0.j + J_z_m1_30 = 0. + 0.j + + J_r_m0_31 = 0. + J_t_m0_31 = 0. + J_z_m0_31 = 0. + J_r_m1_31 = 0. + 0.j + J_t_m1_31 = 0. + 0.j + J_z_m1_31 = 0. + 0.j + + J_r_m0_32 = 0. + J_t_m0_32 = 0. + J_z_m0_32 = 0. + J_r_m1_32 = 0. + 0.j + J_t_m1_32 = 0. + 0.j + J_z_m1_32 = 0. + 0.j + + J_r_m0_33 = 0. + J_t_m0_33 = 0. + J_z_m0_33 = 0. + J_r_m1_33 = 0. + 0.j + J_t_m1_33 = 0. + 0.j + J_z_m1_33 = 0. 
+ 0.j + + # Loop over the number of particles per cell + for j in range(frequency_per_cell): + # Get the particle index + # ---------------------- + # (Since incl_offset is a cumulative sum of particle number, + # and since python index starts at 0, one has to add -1) + ptcl_idx = incl_offset-1-j + + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[ptcl_idx] + yj = y[ptcl_idx] + zj = z[ptcl_idx] + # Velocity + uxj = ux[ptcl_idx] + uyj = uy[ptcl_idx] + uzj = uz[ptcl_idx] + # Inverse gamma + inv_gammaj = inv_gamma[ptcl_idx] + # Weights + wj = w[ptcl_idx] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Compute values in local copies and consider boundaries + ir0 = int64(math.floor(r_cell)) - 1 + + if (ir0 == -2): + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + 
J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+
J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir0 == -1): + J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 
1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir0 >= 0): + J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 
2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Atomically add the registers to global memory + if frequency_per_cell > 0: + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m0_00) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell - 1 + srl), J_r_m0_01) + cuda.atomic.add(j_r_m1.real, 
(iz_cell, ir_cell - 1 + srl), J_r_m1_01.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m0_02) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m0_03) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell ), J_r_m0_10) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_11) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_11.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_11.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell), J_r_m0_12) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell), J_r_m0_13) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m0_20) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 1 + sru), J_r_m0_21) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m0_22) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m0_23) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m0_30) + cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 2 + sru2), J_r_m0_31) + cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m0_32) + cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.imag) + + cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m0_33) + cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.real) + cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.imag) + 
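+ # Note on the atomic writes in this kernel: cuda.atomic.add only
+ # operates on real-valued arrays, so each complex mode-1 component
+ # is deposited with two separate atomic adds, one on the .real view
+ # and one on the .imag view of the array, while the mode-0
+ # components (real by azimuthal symmetry) only need a single
+ # atomic add on the .real view.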
+ cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m0_00) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell - 1 + srl), J_t_m0_01) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m0_02) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m0_03) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell ), J_t_m0_10) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_11) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_11.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_11.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell), J_t_m0_12) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell), J_t_m0_13) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m0_20) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 1 + sru), J_t_m0_21) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m0_22) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m0_23) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m0_30) + cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 2 + sru2), J_t_m0_31) + cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m0_32) + cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), 
J_t_m1_32.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.imag) + + cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m0_33) + cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.real) + cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.imag) + + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m0_00) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell - 1 + srl), J_z_m0_01) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m0_02) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m0_03) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell ), J_z_m0_10) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_11) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_11.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_11.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell), J_z_m0_12) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell), J_z_m0_13) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m0_20) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 1 + sru), J_z_m0_21) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m0_22) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m0_23) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m0_30) + cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.imag) + + 
cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 2 + sru2), J_z_m0_31) + cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m0_32) + cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.imag) + + cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m0_33) + cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.real) + cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.imag) diff --git a/fbpic/particles/deposition/numba_methods.py b/fbpic/particles/deposition/numba_methods.py new file mode 100644 index 000000000..54fedeb39 --- /dev/null +++ b/fbpic/particles/deposition/numba_methods.py @@ -0,0 +1,65 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the deposition methods for rho and J for linear and cubic +order shapes on the CPU with numba. +""" +import numba + +@numba.njit +def deposit_field_numba(Fptcl, Fgrid, + iz, ir, Sz, Sr, sign_guards): + """ + Perform the deposition using numba + + Parameters + ---------- + Fptcl : 1darray of complexs + (one element per macroparticle) + Contains the charge or current for each macroparticle (already + multiplied by exp(im theta)), from which to do the deposition + + Fgrid : 2darray of complexs + Contains the fields on the interpolation grid. + Is modified by this function + + iz, ir : 2darray of ints + Arrays of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + Contains the index of the cells that each macroparticle + will deposit to. + + Sz, Sr: 2darray of floats + Arrays of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + Contains the weight for respective cells from iz and ir, + for each macroparticle. + + sign_guards : float + The sign (+1 or -1) with which the weight of the guard cells should + be added to the 0th cell. + """ + + # Get the total number of particles + Ntot = len(Fptcl) + + # Loop over all particles + for ip in range(Ntot): + # Loop over adjacent cells (given by shape order) + # Use helper variables `ir_corr` and `Sr_corr`, in order to avoid + # modifying ir and Sr in place. (This is not strictly necessary, + # but is just here as a safeguard.)
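+ # Negative values in ir denote guard cells below the axis: the loop
+ # below mirrors them back onto the physical grid (cell abs(ir)-1)
+ # and multiplies their weight by sign_guards, according to the
+ # symmetry of the deposited field.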
+ for cell_index_r in range(ir.shape[0]): + for cell_index_z in range(iz.shape[0]): + # Correct the guard cell index and sign + if ir[cell_index_r, ip] < 0: + ir_corr = abs(ir[cell_index_r, ip]) - 1 + Sr_corr = sign_guards * Sr[cell_index_r, ip] + else: + ir_corr = ir[cell_index_r, ip] + Sr_corr = Sr[cell_index_r, ip] + # Deposit field from particle to the respective grid point + Fgrid[ iz[cell_index_z, ip], ir_corr ] += \ + Sz[cell_index_z,ip] * Sr_corr * Fptcl[ip] diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py new file mode 100644 index 000000000..4f38862a6 --- /dev/null +++ b/fbpic/particles/deposition/threading_methods.py @@ -0,0 +1,1530 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the deposition methods for rho and J for linear and cubic +order shapes on the CPU with threading. +""" +import numba +from numba import prange, int64 +import math +from scipy.constants import c + +# ------------------------------- +# Particle shape Factor functions +# ------------------------------- + +# Linear shapes +@numba.njit +def z_shape_linear(cell_position, index): + iz = int64(math.floor(cell_position)) + if index == 0: + return iz+1.-cell_position + if index == 1: + return cell_position - iz + +@numba.njit +def r_shape_linear(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(ir+1.-cell_position) + if index == 1: + return flip_factor*(cell_position - ir) + +# Cubic shapes +@numba.njit +def z_shape_cubic(cell_position, index): + iz = int64(math.floor(cell_position)) - 1 + if index == 0: + return (-1./6.)*((cell_position-iz)-2)**3 + if index == 1: + return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4) + if index == 2: + return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4) + if index == 3: + return (-1./6.)*(((iz+3)-cell_position)-2)**3 + +@numba.njit +def r_shape_cubic(cell_position, index): + flip_factor = 1. + ir = int64(math.floor(cell_position)) - 1 + if index == 0: + if ir < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*((cell_position-ir)-2)**3 + if index == 1: + if ir+1 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4) + if index == 2: + if ir+2 < 0: + flip_factor = -1. + return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4) + if index == 3: + if ir+3 < 0: + flip_factor = -1. + return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3 + +# ------------------------------- +# Field deposition - linear - rho +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_rho_prange_linear(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0_global, rho_m1_global, + nthreads, ptcl_chunk_indices): + """ + Deposition of the charge density rho using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each thread are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to a global array.
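The final reduction mentioned in these docstrings happens outside the deposition kernels. A minimal sketch of what such a reduction could look like (the helper name and loop layout below are illustrative assumptions, not part of this patch):

    import numba
    from numba import prange

    @numba.njit( parallel=True )
    def sum_reduce_2d_array( global_array, reduced_array ):
        # global_array has shape (nthreads, Nz, Nr): sum the per-thread
        # copies (axis 0) into the actual 2d grid array
        for iz in prange( global_array.shape[1] ):
            for it in range( global_array.shape[0] ):
                for ir in range( global_array.shape[2] ):
                    reduced_array[iz, ir] += global_array[it, iz, ir]

Parallelizing over iz gives each (iz, ir) element a single owner thread, so the reduction itself needs no atomics.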
+ (This final reduction is *not* done in this function) + + Calculates the weighted amount of rho that is deposited to the + 4 cells surrounding the particle based on its shape (linear). + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local charge densities + on the interpolation grid for mode 0 and 1. + (they are modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. divisions of particle array between threads) + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for i_thread in prange( nthreads ): + # Loop over all particles in thread chunk + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] + # Weights + wj = w[i_ptcl] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + + # Original index of the upper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Boundary Region Shifts + ir_flip = int( math.floor(r_cell) ) + + # Declare local field arrays + R_m0_00 = 0. + R_m0_01 = 0. + R_m0_10 = 0. + R_m0_11 = 0. + + R_m1_00 = 0. + 0.j + R_m1_01 = 0. + 0.j + R_m1_10 = 0. + 0.j + R_m1_11 = 0.
+ 0.j + + R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal + + if ir_flip == -1: + R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + else: + R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal + R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal + R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal + R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if ir_cell+1 > (Nr-1): + shift_r = -1 + if iz_cell+1 > Nz-1: + shift_z -= Nz + + # Write ptcl fields to thread-local part of global deposition array + rho_m0_global[i_thread, iz_cell, ir_cell] += R_m0_00 + rho_m1_global[i_thread, iz_cell, ir_cell] += R_m1_00 + + rho_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += R_m0_01 + rho_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += R_m1_01 + + rho_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += R_m0_10 + rho_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += R_m1_10 + + rho_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11 + rho_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11 + + return + +# ------------------------------- +# Field deposition - linear - J +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_J_prange_linear(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0_global, j_r_m1_global, + j_t_m0_global, j_t_m1_global, + j_z_m0_global, j_z_m1_global, + nthreads, ptcl_chunk_indices): + """ + Deposition of the current density J using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each thread are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of J that is deposited to the + 4 cells surrounding the particle based on its shape (linear). + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local current component + in each direction (r, t, z) on the interpolation grid for mode 0 and 1.
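The ptcl_chunk_indices argument used by these functions splits the particle arrays into one contiguous chunk per thread. A minimal sketch of how such indices could be built (illustrative only; the helper name is an assumption):

    import numpy as np

    def get_chunk_indices( Ntot, nthreads ):
        # nthreads+1 monotonically increasing indices that cut the
        # particle arrays [0, Ntot) into nearly equal chunks; thread i
        # then loops from indices[i] to indices[i+1], matching the
        # range( ptcl_chunk_indices[i_thread], ... ) bounds above
        return np.linspace( 0, Ntot, nthreads + 1 ).astype( np.int64 )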
+ (they are modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. divisions of particle array between threads) + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for i_thread in prange( nthreads ): + # Loop over all particles in thread chunk + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] + # Velocity + uxj = ux[i_ptcl] + uyj = uy[i_ptcl] + uzj = uz[i_ptcl] + # Inverse gamma + inv_gammaj = inv_gamma[i_ptcl] + # Weights + wj = w[i_ptcl] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Original index of the upper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Boundary Region Shifts + ir_flip = int( math.floor(r_cell) ) + + # Declare local field arrays + J_r_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m0_00 = 0. + J_t_m1_00 = 0. + 0.j + J_z_m0_00 = 0. + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m0_01 = 0. + J_t_m1_01 = 0. + 0.j + J_z_m0_01 = 0. + J_z_m1_01 = 0. + 0.j + + J_r_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m0_10 = 0. + J_t_m1_10 = 0. + 0.j + J_z_m0_10 = 0. + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m0_11 = 0. + J_t_m1_11 = 0. + 0.j + J_z_m0_11 = 0. + J_z_m1_11 = 0.
+ 0.j + + J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Take into account lower r flips + if ir_flip == -1: + J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + else: + J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal + J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal + J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal + J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal + J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal + J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal + J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal + J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal + J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal + J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal + J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal + J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal + + # Cell shifts for the simulation boundaries + shift_r = 0 + shift_z = 0 + if (ir_cell+1) > (Nr-1): + shift_r = -1 + if (iz_cell+1) > Nz-1: + shift_z -= Nz + + # Write ptcl fields to thread-local part of global deposition array + j_r_m0_global[i_thread,iz_cell, ir_cell] += J_r_m0_00 + j_r_m1_global[i_thread,iz_cell, ir_cell] += J_r_m1_00 + + j_r_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01 + j_r_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01 + + j_r_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10 + 
j_r_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10 + + j_r_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11 + j_r_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11 + + j_t_m0_global[i_thread,iz_cell, ir_cell] += J_t_m0_00 + j_t_m1_global[i_thread,iz_cell, ir_cell] += J_t_m1_00 + + j_t_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01 + j_t_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01 + + j_t_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10 + j_t_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10 + + j_t_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11 + j_t_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11 + + j_z_m0_global[i_thread,iz_cell, ir_cell] += J_z_m0_00 + j_z_m1_global[i_thread,iz_cell, ir_cell] += J_z_m1_00 + + j_z_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01 + j_z_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01 + + j_z_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10 + j_z_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10 + + j_z_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11 + j_z_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11 + + return + + +# ------------------------------- +# Field deposition - cubic - rho +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_rho_prange_cubic(x, y, z, w, + invdz, zmin, Nz, + invdr, rmin, Nr, + rho_m0_global, rho_m1_global, + nthreads, ptcl_chunk_indices): + """ + Deposition of the charge density rho using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each thread are + stored in copies of the global grid. At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of rho that is deposited to the + 16 cells surrounding the particle based on its shape (cubic). + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local charge densities + on the interpolation grid for mode 0 and 1. + (they are modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the considered direction + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e.
divisions of particle array between threads) + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for i_thread in prange( nthreads ): + # Loop over all particles in thread chunk + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] + # Weights + wj = w[i_ptcl] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate rho + # -------------------------------------------- + # Mode 0 + R_m0_scal = wj * exptheta_m0 + # Mode 1 + R_m1_scal = wj * exptheta_m1 + + # Original index of the upper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Compute values in local copies and consider boundaries + ir_flip = int( math.floor(r_cell) ) - 1 + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode. + R_m0_00 = 0. + R_m1_00 = 0. + 0.j + + R_m0_01 = 0. + R_m1_01 = 0. + 0.j + + R_m0_02 = 0. + R_m1_02 = 0. + 0.j + + R_m0_03 = 0. + R_m1_03 = 0. + 0.j + + R_m0_10 = 0. + R_m1_10 = 0. + 0.j + + R_m0_11 = 0. + R_m1_11 = 0. + 0.j + + R_m0_12 = 0. + R_m1_12 = 0. + 0.j + + R_m0_13 = 0. + R_m1_13 = 0. + 0.j + + R_m0_20 = 0. + R_m1_20 = 0. + 0.j + + R_m0_21 = 0. + R_m1_21 = 0. + 0.j + + R_m0_22 = 0. + R_m1_22 = 0. + 0.j + + R_m0_23 = 0. + R_m1_23 = 0. + 0.j + + R_m0_30 = 0. + R_m1_30 = 0. + 0.j + + R_m0_31 = 0. + R_m1_31 = 0. + 0.j + + R_m0_32 = 0. + R_m1_32 = 0. + 0.j + + R_m0_33 = 0. + R_m1_33 = 0.
+ 0.j + + if (ir_flip == -2): + R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + if (ir_flip == -1): + R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 
3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + if (ir_flip >= 0): + R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal + + R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal + R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal + R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal + R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal + R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal + R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal + R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal + R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal + + # 
Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + # Write ptcl fields to thread-local part of global deposition array + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00 + rho_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += R_m0_01 + rho_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += R_m1_01 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell] += R_m0_10 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell] += R_m1_10 + rho_m0_global[i_thread, iz_cell, ir_cell] += R_m0_11 + rho_m1_global[i_thread, iz_cell, ir_cell] += R_m1_11 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += R_m0_12 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += R_m1_12 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += R_m0_13 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += R_m1_13 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20 + rho_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += R_m0_21 + rho_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += R_m1_21 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23 + rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30 + rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30 + rho_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += R_m0_31 + rho_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += R_m1_31 + rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32 + rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32 + rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33 + rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33 + + return + +# ------------------------------- +# Field deposition - cubic - J +# ------------------------------- + +@numba.njit(parallel=True) +def deposit_J_prange_cubic(x, y, z, w, + ux, uy, uz, inv_gamma, + invdz, zmin, Nz, + invdr, rmin, Nr, + j_r_m0_global, j_r_m1_global, + j_t_m0_global, j_t_m1_global, + j_z_m0_global, j_z_m1_global, + nthreads, ptcl_chunk_indices): + """ + Deposition of the current density J using numba prange on the CPU. + Iterates over the threads in parallel, while each thread iterates + over a batch of particles. Intermediate results for each thread are + stored in copies of the global grid.
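Both the linear and cubic shape factors defined at the top of this file form a partition of unity: the weights for one particle sum to 1, which is what keeps the deposition charge-conserving. A quick sanity check (illustrative only), using the z_shape_cubic function defined above:

    # The four cubic-spline weights at an arbitrary position sum to 1
    pos = 3.7  # position in cell units
    total = ( z_shape_cubic( pos, 0 ) + z_shape_cubic( pos, 1 )
            + z_shape_cubic( pos, 2 ) + z_shape_cubic( pos, 3 ) )
    assert abs( total - 1. ) < 1e-12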
At the end of the parallel loop, + the thread-local field arrays are combined (summed) to the global array. + (This final reduction is *not* done in this function) + + Calculates the weighted amount of J that is deposited to the + 16 cells surrounding the particle based on its shape (cubic). + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + w : 1d array of floats + The weights of the particles + + ux, uy, uz : 1darray of floats (in meters * second^-1) + The velocity of the particles + + inv_gamma : 1darray of floats + The inverse of the relativistic gamma factor + + j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr) + The global helper arrays to store the thread local current component + in each direction (r, t, z) on the interpolation grid for mode 0 and 1. + (they are modified by this function) + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box, + along the direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + nthreads : int + Number of CPU threads used with numba prange + + ptcl_chunk_indices : array of int, of size nthreads+1 + The indices (of the particle array) between which each thread + should loop. (i.e. divisions of particle array between threads) + """ + # Deposit the field per cell in parallel (for threads < number of cells) + for i_thread in prange( nthreads ): + # Loop over all particles in thread chunk + for i_ptcl in range( ptcl_chunk_indices[i_thread], + ptcl_chunk_indices[i_thread+1] ): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i_ptcl] + yj = y[i_ptcl] + zj = z[i_ptcl] + # Velocity + uxj = ux[i_ptcl] + uyj = uy[i_ptcl] + uzj = uz[i_ptcl] + # Inverse gamma + inv_gammaj = inv_gamma[i_ptcl] + # Weights + wj = w[i_ptcl] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + # Avoid division by 0. + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos + 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the currents + # -------------------------------------------- + # Mode 0 + J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0 + J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0 + J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0 + # Mode 1 + J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1 + J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1 + J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1 + + # Original index of the upper and lower cell + ir_cell = int(math.floor( r_cell )) + iz_cell = int(math.floor( z_cell )) + + # Treat the boundary conditions + # guard cells in lower r + if ir_cell < 0: + ir_cell = 0 + # absorbing in upper r + if ir_cell > Nr-1: + ir_cell = Nr-1 + # periodic boundaries in z + if iz_cell < 0: + iz_cell += Nz + if iz_cell > Nz-1: + iz_cell -= Nz + + # Compute values in local copies and consider boundaries + ir_flip = int64(math.floor(r_cell)) - 1 + + # Declare the local field value for + # all possible deposition directions, + # depending on the shape order and per mode for r,t and z.
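+ # (Using individual scalar variables rather than a small local
+ # array means numba can keep these accumulators in registers
+ # inside the particle loop; they are flushed to the thread-local
+ # global arrays only once per particle, after the branches below.)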
+ J_r_m0_00 = 0. + J_t_m0_00 = 0. + J_z_m0_00 = 0. + J_r_m1_00 = 0. + 0.j + J_t_m1_00 = 0. + 0.j + J_z_m1_00 = 0. + 0.j + + J_r_m0_01 = 0. + J_t_m0_01 = 0. + J_z_m0_01 = 0. + J_r_m1_01 = 0. + 0.j + J_t_m1_01 = 0. + 0.j + J_z_m1_01 = 0. + 0.j + + J_r_m0_02 = 0. + J_t_m0_02 = 0. + J_z_m0_02 = 0. + J_r_m1_02 = 0. + 0.j + J_t_m1_02 = 0. + 0.j + J_z_m1_02 = 0. + 0.j + + J_r_m0_03 = 0. + J_t_m0_03 = 0. + J_z_m0_03 = 0. + J_r_m1_03 = 0. + 0.j + J_t_m1_03 = 0. + 0.j + J_z_m1_03 = 0. + 0.j + + J_r_m0_10 = 0. + J_t_m0_10 = 0. + J_z_m0_10 = 0. + J_r_m1_10 = 0. + 0.j + J_t_m1_10 = 0. + 0.j + J_z_m1_10 = 0. + 0.j + + J_r_m0_11 = 0. + J_t_m0_11 = 0. + J_z_m0_11 = 0. + J_r_m1_11 = 0. + 0.j + J_t_m1_11 = 0. + 0.j + J_z_m1_11 = 0. + 0.j + + J_r_m0_12 = 0. + J_t_m0_12 = 0. + J_z_m0_12 = 0. + J_r_m1_12 = 0. + 0.j + J_t_m1_12 = 0. + 0.j + J_z_m1_12 = 0. + 0.j + + J_r_m0_13 = 0. + J_t_m0_13 = 0. + J_z_m0_13 = 0. + J_r_m1_13 = 0. + 0.j + J_t_m1_13 = 0. + 0.j + J_z_m1_13 = 0. + 0.j + + J_r_m0_20 = 0. + J_t_m0_20 = 0. + J_z_m0_20 = 0. + J_r_m1_20 = 0. + 0.j + J_t_m1_20 = 0. + 0.j + J_z_m1_20 = 0. + 0.j + + J_r_m0_21 = 0. + J_t_m0_21 = 0. + J_z_m0_21 = 0. + J_r_m1_21 = 0. + 0.j + J_t_m1_21 = 0. + 0.j + J_z_m1_21 = 0. + 0.j + + J_r_m0_22 = 0. + J_t_m0_22 = 0. + J_z_m0_22 = 0. + J_r_m1_22 = 0. + 0.j + J_t_m1_22 = 0. + 0.j + J_z_m1_22 = 0. + 0.j + + J_r_m0_23 = 0. + J_t_m0_23 = 0. + J_z_m0_23 = 0. + J_r_m1_23 = 0. + 0.j + J_t_m1_23 = 0. + 0.j + J_z_m1_23 = 0. + 0.j + + J_r_m0_30 = 0. + J_t_m0_30 = 0. + J_z_m0_30 = 0. + J_r_m1_30 = 0. + 0.j + J_t_m1_30 = 0. + 0.j + J_z_m1_30 = 0. + 0.j + + J_r_m0_31 = 0. + J_t_m0_31 = 0. + J_z_m0_31 = 0. + J_r_m1_31 = 0. + 0.j + J_t_m1_31 = 0. + 0.j + J_z_m1_31 = 0. + 0.j + + J_r_m0_32 = 0. + J_t_m0_32 = 0. + J_z_m0_32 = 0. + J_r_m1_32 = 0. + 0.j + J_t_m1_32 = 0. + 0.j + J_z_m1_32 = 0. + 0.j + + J_r_m0_33 = 0. + J_t_m0_33 = 0. + J_z_m0_33 = 0. + J_r_m1_33 = 0. + 0.j + J_t_m1_33 = 0. + 0.j + J_z_m1_33 = 0. 
+ 0.j + + if (ir_flip == -2): + J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell,
+ J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 
3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir_flip == -1): + J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += 
r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += 
r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + if (ir_flip >= 0): + J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal + J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal + J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal + J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal + J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal + J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal + J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal + J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal + + J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 
2)*J_t_m1_scal + J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal + J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal + J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal + J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal + J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal + J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal + J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal + J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal + + J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 
1)*J_z_m1_scal + J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal + J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal + J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal + J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal + J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal + J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal + J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal + J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal + + # Index Shifting since local copies are centered around + # the current cell + srl = 0 # shift r lower + sru = 0 # shift r upper inner + sru2 = 0 # shift r upper outer + szl = 0 # shift z lower + szu = 0 # shift z upper inner + szu2 = 0 # shift z upper outer + if (iz_cell-1) < 0: + szl += Nz + if (iz_cell) == (Nz - 1): + szu -= Nz + szu2 -= Nz + if (iz_cell+1) == (Nz - 1): + szu2 -= Nz + if (ir_cell) >= (Nr - 1): + sru = -1 + sru2 = -2 + if (ir_cell+1) == (Nr - 1): + sru2 = -1 + if (ir_cell-1) < 0: + srl = 1 + + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00 + j_r_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_r_m0_01 + j_r_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_r_m1_01 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10 + j_r_m0_global[i_thread, iz_cell, ir_cell] += J_r_m0_11 + j_r_m1_global[i_thread, iz_cell, ir_cell] += J_r_m1_11 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_r_m0_12 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_r_m1_12 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20 + j_r_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_r_m0_21 + j_r_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_r_m1_21 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22 + j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23 + j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30 + j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30 + j_r_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31 + j_r_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31 + j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32 + 
j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32 + j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33 + j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33 + + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00 + j_t_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_t_m0_01 + j_t_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_t_m1_01 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10 + j_t_m0_global[i_thread, iz_cell, ir_cell] += J_t_m0_11 + j_t_m1_global[i_thread, iz_cell, ir_cell] += J_t_m1_11 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_t_m0_12 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_t_m1_12 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20 + j_t_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_t_m0_21 + j_t_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_t_m1_21 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23 + j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30 + j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30 + j_t_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31 + j_t_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31 + j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32 + j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32 + j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33 + j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33 + + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00 + j_z_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_z_m0_01 + j_z_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_z_m1_01 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10 + j_z_m0_global[i_thread, iz_cell, ir_cell] += J_z_m0_11 + j_z_m1_global[i_thread, iz_cell, ir_cell] += J_z_m1_11 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_z_m0_12 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_z_m1_12 + 
j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20 + j_z_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_z_m0_21 + j_z_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_z_m1_21 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23 + j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30 + j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30 + j_z_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31 + j_z_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31 + j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32 + j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32 + j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33 + j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33 + + return + +# ----------------------------------------------------------------------- +# Parallel reduction of the global arrays for threads into a single array +# ----------------------------------------------------------------------- + +@numba.njit( parallel=True ) +def sum_reduce_2d_array( global_array, reduced_array ): + """ + Sum the array `global_array` along its first axis and + add it into `reduced_array`. + + Parameters + ---------- + global_array: 3darray of complexs + Field array whose first dimension corresponds to the + reduction dimension (typically: the number of threads used + during the current deposition) + + reduced_array: 2darray of complexs + Field array into which the result of the reduction is added + """ + # Extract size of each dimension + Nreduce, Nz, Nr = global_array.shape + + # Parallel loop over iz + for iz in prange( Nz ): + # Loop over the reduction dimension (slow dimension) + for it in range( Nreduce ): + # Loop over ir (fast dimension) + for ir in range( Nr ): + reduced_array[ iz, ir ] += global_array[ it, iz, ir ] diff --git a/fbpic/particles/gathering/__init__.py b/fbpic/particles/gathering/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/fbpic/particles/cuda_methods.py b/fbpic/particles/gathering/cuda_methods.py similarity index 62% rename from fbpic/particles/cuda_methods.py rename to fbpic/particles/gathering/cuda_methods.py index dedc1f2e7..12c763860 100644 --- a/fbpic/particles/cuda_methods.py +++ b/fbpic/particles/gathering/cuda_methods.py @@ -3,202 +3,14 @@ # License: 3-Clause-BSD-LBNL """ This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the optimized particles methods that use cuda on a GPU +It defines the field gathering methods linear and cubic order shapes +on the GPU using CUDA. 
""" from numba import cuda, float64, int64 -from accelerate.cuda import sorting import math -from scipy.constants import c, e -import numpy as np # ----------------------- -# Particle pusher utility -# ----------------------- - -@cuda.jit(device=True, inline=True) -def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, - Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): - """ - Push at single macroparticle, using the Vay pusher - """ - # Get the magnetic rotation vector - taux = bconst*Bx - tauy = bconst*By - tauz = bconst*Bz - tau2 = taux**2 + tauy**2 + tauz**2 - - # Get the momenta at the half timestep - uxp = ux_i + econst*Ex \ - + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) - uyp = uy_i + econst*Ey \ - + inv_gamma_i*( uz_i*taux - ux_i*tauz ) - uzp = uz_i + econst*Ez \ - + inv_gamma_i*( ux_i*tauy - uy_i*taux ) - sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 - utau = uxp*taux + uyp*tauy + uzp*tauz - - # Get the new 1./gamma - inv_gamma_f = math.sqrt( - 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) - - # Reuse the tau and utau arrays to save memory - tx = inv_gamma_f*taux - ty = inv_gamma_f*tauy - tz = inv_gamma_f*tauz - ut = inv_gamma_f*utau - s = 1./( 1 + tau2*inv_gamma_f**2 ) - - # Get the new u - ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) - uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) - uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) - - return( ux_f, uy_f, uz_f, inv_gamma_f ) - - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64, float64, int32, float64)') -def push_p_gpu( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, - q, m, Ntot, dt ) : - """ - Advance the particles' momenta, using cuda on the GPU - - Parameters - ---------- - ux, uy, uz : 1darray of floats - The velocity of the particles - (is modified by this function) - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - Ex, Ey, Ez : 1darray of floats - The electric fields acting on the particles - - Bx, By, Bz : 1darray of floats - The magnetic fields acting on the particles - - q : float - The charge of the particle species - - m : float - The mass of the particle species - - Ntot : int - The total number of particles - - dt : float - The time by which the momenta is advanced - """ - # Set a few constants - econst = q*dt/(m*c) - bconst = 0.5*q*dt/m - - #Cuda 1D grid - ip = cuda.grid(1) - - # Loop over the particles - if ip < Ntot: - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst) - -@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64, int32, float64, int16[:])') -def push_p_ioniz_gpu( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, - m, Ntot, dt, ionization_level ) : - """ - Advance the particles' momenta, using numba on the GPU - This take into account that the particles are ionizable, and thus - that their charge is determined by `ionization_level` - - Parameters - ---------- - ux, uy, uz : 1darray of floats - The velocity of the particles - (is modified by this function) - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - Ex, Ey, Ez : 1darray of floats - The electric fields acting on the particles - - Bx, By, Bz : 1darray of floats - The magnetic fields acting on the particles - - m : float - The mass of the particle species - - Ntot : int - The total 
number of particles - - dt : float - The time by which the momenta is advanced - - ionization_level : 1darray of ints - The number of electrons that each ion is missing - (compared to a neutral atom) - """ - #Cuda 1D grid - ip = cuda.grid(1) - - # Loop over the particles - if ip < Ntot: - if ionization_level[ip] != 0: - # Set a few constants - econst = ionization_level[ip] * e * dt/(m*c) - bconst = 0.5 * ionization_level[ip] * e * dt/m - # Use the Vay pusher - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst) - -@cuda.jit('void(float64[:], float64[:], float64[:], \ - float64[:], float64[:], float64[:], \ - float64[:], float64)') -def push_x_gpu( x, y, z, ux, uy, uz, inv_gamma, dt ) : - """ - Advance the particles' positions over one half-timestep - - This assumes that the positions (x, y, z) are initially either - one half-timestep *behind* the momenta (ux, uy, uz), or at the - same timestep as the momenta. - - Parameters - ---------- - x, y, z : 1darray of floats (in meters) - The position of the particles - (is modified by this function) - - ux, uy, uz : 1darray of floats (in meters * second^-1) - The velocity of the particles - - inv_gamma : 1darray of floats - The inverse of the relativistic gamma factor - - dt : float (seconds) - The time by which the position is advanced - """ - # Half timestep, multiplied by c - chdt = c*0.5*dt - - i = cuda.grid(1) - if i < x.shape[0]: - # Particle push - inv_g = inv_gamma[i] - x[i] += chdt*inv_g*ux[i] - y[i] += chdt*inv_g*uy[i] - z[i] += chdt*inv_g*uz[i] - -# ----------------------- -# Field gathering utility +# Field gathering linear # ----------------------- @cuda.jit('void(float64[:], float64[:], float64[:], \ @@ -521,6 +333,9 @@ def gather_field_gpu_linear(x, y, z, By[i] = sin*Fr + cos*Ft Bz[i] = Fz +# ----------------------- +# Field gathering cubic +# ----------------------- @cuda.jit('void(float64[:], float64[:], float64[:], \ float64, float64, int32, \ @@ -615,8 +430,8 @@ def gather_field_gpu_cubic(x, y, z, z_cell = invdz*(zj - zmin) - 0.5 # Calculate the shape factors - Sr = cuda.local.array((4,), dtype=float64) ir = cuda.local.array((4,), dtype=int64) + Sr = cuda.local.array((4,), dtype=float64) ir[0] = int64(math.floor(r_cell)) - 1 ir[1] = ir[0] + 1 ir[2] = ir[1] + 1 @@ -779,219 +594,4 @@ def gather_field_gpu_cubic(x, y, z, # and write to particle field arrays Bx[i] = cos*Fr - sin*Ft By[i] = sin*Fr + cos*Ft - Bz[i] = Fz - -# ----------------------------------------------------- -# Sorting utilities - get_cell_idx / sort / prefix_sum -# ----------------------------------------------------- - -@cuda.jit('void(int32[:], uint32[:], \ - float64[:], float64[:], float64[:], \ - float64, float64, int32, \ - float64, float64, int32)') -def get_cell_idx_per_particle(cell_idx, sorted_idx, - x, y, z, - invdz, zmin, Nz, - invdr, rmin, Nr): - """ - Get the cell index of each particle. - The cell index is 1d and calculated by: - cell index in z + cell index in r * number of cells in z. - The cell_idx of a particle is defined by - the lower cell in r and z, that it deposits its field to. 
- - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - sorted_idx : 1darray of integers - The sorted index array needs to be reset - before doing the sort - - x, y, z : 1darray of floats (in meters) - The position of the particles - (is modified by this function) - - invdz, invdr : float (in meters^-1) - Inverse of the grid step along the considered direction - - zmin, rmin : float (in meters) - Position of the edge of the simulation box, in each direction - - Nz, Nr : int - Number of gridpoints along the considered direction - """ - i = cuda.grid(1) - if i < cell_idx.shape[0]: - # Preliminary arrays for the cylindrical conversion - xj = x[i] - yj = y[i] - zj = z[i] - rj = math.sqrt( xj**2 + yj**2 ) - - # Positions of the particles, in the cell unit - r_cell = invdr*(rj - rmin) - 0.5 - z_cell = invdz*(zj - zmin) - 0.5 - - # Original index of the uppper and lower cell - ir_lower = int(math.floor( r_cell )) - iz_lower = int(math.floor( z_cell )) - - # Treat the boundary conditions - # guard cells in lower r - if ir_lower < 0: - ir_lower = 0 - # absorbing in upper r - if ir_lower > Nr-1: - ir_lower = Nr-1 - # periodic boundaries in z - if iz_lower < 0: - iz_lower += Nz - if iz_lower > Nz-1: - iz_lower -= Nz - - # Reset sorted_idx array - sorted_idx[i] = i - # Calculate the 1D cell_idx by cell_idx_ir + cell_idx_iz * Nr - cell_idx[i] = ir_lower + iz_lower * Nr - -def sort_particles_per_cell(cell_idx, sorted_idx): - """ - Sort the cell index of the particles and - modify the sorted index array accordingly. - - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - sorted_idx : 1darray of integers - Represents the original index of the - particle before the sorting. - """ - Ntot = cell_idx.shape[0] - if Ntot > 0: - sorter = sorting.RadixSort(Ntot, dtype = np.int32) - sorter.sort(cell_idx, vals = sorted_idx) - -@cuda.jit('void(int32[:], int32[:])') -def incl_prefix_sum(cell_idx, prefix_sum): - """ - Perform an inclusive parallel prefix sum on the sorted - cell index array. The prefix sum array represents the - cumulative sum of the number of particles per cell - for each cell index. - - Parameters - ---------- - cell_idx : 1darray of integers - The cell index of the particle - - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - # i is the index of the macroparticle - i = cuda.grid(1) - if i < cell_idx.shape[0]-1: - # ci: index of the cell of the present macroparticle - ci = cell_idx[i] - # ci_next: index of the cell of the next macroparticle - ci_next = cell_idx[i+1] - # Fill all the cells between ci and ci_next with the - # inclusive cumulative sum of the number particles until ci - while ci < ci_next: - # The cumulative sum of the number of particle per cell - # until ci is i+1 (since i obeys python index, starting at 0) - prefix_sum[ci] = i+1 - ci += 1 - # The last "macroparticle" of the cell_idx array fills up the - # rest of the prefix sum array - if i == cell_idx.shape[0]-1: - # Get the cell_index of the last macroparticle - ci = cell_idx[i] - # Fill all the remaining entries of the prefix sum array - for empty_index in range(ci, prefix_sum.shape[0]): - prefix_sum[empty_index] = i+1 - -@cuda.jit('void(int32[:])') -def reset_prefix_sum(prefix_sum): - """ - Resets the prefix sum. Sets all the values - to zero. 
- - Parameters - ---------- - prefix_sum : 1darray of integers - Represents the cumulative sum of - the particles per cell - """ - i = cuda.grid(1) - if i < prefix_sum.shape[0]: - prefix_sum[i] = 0 - -@cuda.jit('void(uint32[:], float64[:], float64[:])') -def write_sorting_buffer(sorted_idx, val, buf): - """ - Writes the values of a particle array to a buffer, - while rearranging them to match the sorted cell index array. - - Parameters - ---------- - sorted_idx : 1darray of integers - Represents the original index of the - particle before the sorting - - val : 1d array of floats - A particle data array - - buf : 1d array of floats - A buffer array to temporarily store the - sorted particle data array - """ - i = cuda.grid(1) - if i < val.shape[0]: - buf[i] = val[sorted_idx[i]] - -# ----------------------------------------------------- -# Device array creation utility (will be removed later) -# ----------------------------------------------------- - -def cuda_deposition_arrays(Nz = None, Nr = None, fieldtype = None): - """ - Create empty arrays on the GPU for the charge and - current deposition in each of the 4 possible direction. - - ########################################### - # Needs to be moved to the fields package! - ########################################### - - Parameters - ---------- - Nz : int - Number of cells in z. - Nr : int - Number of cells in r. - - fieldtype : string - Either 'rho' or 'J'. - """ - # Create empty arrays to store the four different possible - # cell directions a particle can deposit to. - if fieldtype == 'rho': - # Rho - third dimension represents 2 modes - rho0 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho1 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho2 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - rho3 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128) - return rho0, rho1, rho2, rho3 - - if fieldtype == 'J': - # J - third dimension represents 2 modes - # times 3 dimensions (r, t, z) - J0 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J1 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J2 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - J3 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128) - return J0, J1, J2, J3 + Bz[i] = Fz \ No newline at end of file diff --git a/fbpic/particles/gathering/numba_methods.py b/fbpic/particles/gathering/numba_methods.py new file mode 100644 index 000000000..f85a79ea0 --- /dev/null +++ b/fbpic/particles/gathering/numba_methods.py @@ -0,0 +1,82 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the field gathering methods linear and cubic order shapes +on the CPU with numba. +""" +import numba + +@numba.jit(nopython=True) +def gather_field_numba(exptheta, m, Fgrid, Fptcl, + iz, ir, Sz, Sr, sign_guards): + """ + Perform the weighted sum using numba + + Parameters + ---------- + exptheta : 1darray of complexs + (one element per macroparticle) + Contains exp(-im theta) for each macroparticle + + m : int + Index of the mode. 
+ Determines whether a factor 2 should be applied + + Fgrid : 2darray of complexs + Contains the fields on the interpolation grid, + from which to do the gathering + + Fptcl : 1darray of floats + (one element per macroparticle) + Contains the fields for each macroparticle + Is modified by this function + + iz, ir : 2darray of ints + Arrays of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + Contains the index of the cells that each macroparticle + will gather from. + + Sz, Sr : 2darray of floats + Arrays of shape (shape_order+1, Ntot) + where Ntot is the number of macroparticles + Contains the weight for respective cells from iz and ir, + for each macroparticle. + + sign_guards : float + The sign (+1 or -1) with which the weight of the guard cells should + be added to the 0th cell. + """ + # Get the total number of particles + Ntot = len(Fptcl) + + # Loop over the particles + for ip in range(Ntot): + # Erase the temporary variable + F = 0.j + # Loop over all the adjacent cells (given by shape order) + # Use helper variables `ir_corr` and `Sr_corr`. + # This is necessary, because ir and Sr should **not** be modified + # **in-place**. (This is because ir and Sr are reused several + # times, as we call the present function 3 times, with different + # values for sign_guards.) + for cell_index_r in range(ir.shape[0]): + for cell_index_z in range(iz.shape[0]): + # Correct the guard cell index and sign + if ir[cell_index_r, ip] < 0: + ir_corr = abs(ir[cell_index_r, ip]) - 1 + Sr_corr = sign_guards * Sr[cell_index_r, ip] + else: + ir_corr = ir[cell_index_r, ip] + Sr_corr = Sr[cell_index_r, ip] + # Gather the field value at the respective grid point + F += Sz[cell_index_z, ip] * Sr_corr * \ + Fgrid[ iz[cell_index_z, ip], ir_corr] + + # Add the complex phase + if m == 0: + Fptcl[ip] += (F * exptheta[ip]).real + if m > 0: + Fptcl[ip] += 2 * (F * exptheta[ip]).real \ No newline at end of file diff --git a/fbpic/particles/gathering/threading_methods.py b/fbpic/particles/gathering/threading_methods.py new file mode 100644 index 000000000..9809fcce5 --- /dev/null +++ b/fbpic/particles/gathering/threading_methods.py @@ -0,0 +1,601 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the field gathering methods linear and cubic order shapes +on the CPU with threading. +""" +from numba import prange, int64 +import numba +import math +import numpy as np + +# ----------------------- +# Field gathering linear +# ----------------------- + +@numba.njit(parallel=True) +def gather_field_prange_linear(x, y, z, + invdz, zmin, Nz, + invdr, rmin, Nr, + Er_m0, Et_m0, Ez_m0, + Er_m1, Et_m1, Ez_m1, + Br_m0, Bt_m0, Bz_m0, + Br_m1, Bt_m1, Bz_m1, + Ex, Ey, Ez, + Bx, By, Bz): + """ + Gathering of the fields (E and B) using numba with multi-threading. + Iterates over the particles, calculates the weighted amount + of fields acting on each particle based on its shape (linear). + Fields are gathered in cylindrical coordinates and then + transformed to cartesian coordinates. + Supports only mode 0 and 1. 
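+ The field on a macroparticle at angle theta is reconstructed + from the azimuthal modes as Re[F_m0]+2*Re[F_m1*exp(-i*theta)], + hence the factor 2 applied to the mode-1 contribution below. 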
+ + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + (is modified by this function) + """ + # Gather the fields on the particles in parallel + for i in prange(x.shape[0]): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt( xj**2 + yj**2 ) + if (rj !=0. ) : + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else : + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos - 1.j*sin + + # Get linear weights for the gathering + # -------------------------------------------- + # Positions of the particles, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + # Original index of the upper and lower cell + ir_lower = int(math.floor( r_cell )) + ir_upper = ir_lower + 1 + iz_lower = int(math.floor( z_cell )) + iz_upper = iz_lower + 1 + # Linear weight + Sr_lower = ir_upper - r_cell + Sr_upper = r_cell - ir_lower + Sz_lower = iz_upper - z_cell + Sz_upper = z_cell - iz_lower + # Set guard weights to zero + Sr_guard = 0. + + # Treat the boundary conditions + # -------------------------------------------- + # guard cells in lower r + if ir_lower < 0: + Sr_guard = Sr_lower + Sr_lower = 0. + ir_lower = 0 + # absorbing in upper r + if ir_lower > Nr-1: + ir_lower = Nr-1 + if ir_upper > Nr-1: + ir_upper = Nr-1 + # periodic boundaries in z + # lower z boundaries + if iz_lower < 0: + iz_lower += Nz + if iz_upper < 0: + iz_upper += Nz + # upper z boundaries + if iz_lower > Nz-1: + iz_lower -= Nz + if iz_upper > Nz-1: + iz_upper -= Nz + + # Precalculate the shape factors + S_ll = Sz_lower*Sr_lower + S_lu = Sz_lower*Sr_upper + S_ul = Sz_upper*Sr_lower + S_uu = Sz_upper*Sr_upper + S_lg = Sz_lower*Sr_guard + S_ug = Sz_upper*Sr_guard + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
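+ + # Near the axis (ir_lower == ir_upper == 0), the contribution of the + # guard cell below the axis is gathered from the axis cell itself, + # with a sign set by the parity of each field component in each mode. 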
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Ez_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0] + Fz_m += 1. * S_ug * Ez_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Er_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Et_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Er_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Et_m1[ iz_upper, 0] + Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = cos*Fr - sin*Ft + Ey[i] = sin*Fr + cos*Ft + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
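+ + # The magnetic field is gathered with the same weights and the same + # per-mode accumulation as the electric field above. 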
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ] + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0] + Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0] + Fz_m += 1. * S_lg * Bz_m0[ iz_lower, 0] + # Upper cell in z + Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0] + Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0] + Fz_m += 1. * S_ug * Bz_m0[ iz_upper, 0] + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + # Lower cell in z, Lower cell in r + Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ] + Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ] + Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ] + # Lower cell in z, Upper cell in r + Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ] + Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ] + Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ] + # Upper cell in z, Lower cell in r + Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ] + Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ] + Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ] + # Upper cell in z, Upper cell in r + Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ] + Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ] + Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ] + + # Add the fields from the guard cells + if ir_lower == ir_upper == 0: + # Lower cell in z + Fr_m += 1. * S_lg * Br_m1[ iz_lower, 0] + Ft_m += 1. * S_lg * Bt_m1[ iz_lower, 0] + Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0] + # Upper cell in z + Fr_m += 1. * S_ug * Br_m1[ iz_upper, 0] + Ft_m += 1. * S_ug * Bt_m1[ iz_upper, 0] + Fz_m += -1. * S_ug * Bz_m1[ iz_upper, 0] + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Bx[i] = cos*Fr - sin*Ft + By[i] = sin*Fr + cos*Ft + Bz[i] = Fz + + return Ex, Ey, Ez, Bx, By, Bz + +# ----------------------- +# Field gathering cubic +# ----------------------- + +@numba.njit(parallel=True) +def gather_field_prange_cubic(x, y, z, + invdz, zmin, Nz, + invdr, rmin, Nr, + Er_m0, Et_m0, Ez_m0, + Er_m1, Et_m1, Ez_m1, + Br_m0, Bt_m0, Bz_m0, + Br_m1, Bt_m1, Bz_m1, + Ex, Ey, Ez, + Bx, By, Bz): + """ + Gathering of the fields (E and B) using numba with multi-threading. + Iterates over the particles, calculates the weighted amount + of fields acting on each particle based on its shape (cubic). + Fields are gathered in cylindrical coordinates and then + transformed to cartesian coordinates. 
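+ The particle shape is a cubic B-spline, so each macroparticle + gathers from the 4x4 cells surrounding its position. 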
+ Supports only mode 0 and 1. + + Parameters + ---------- + x, y, z : 1darray of floats (in meters) + The position of the particles + + invdz, invdr : float (in meters^-1) + Inverse of the grid step along the considered direction + + zmin, rmin : float (in meters) + Position of the edge of the simulation box along the + direction considered + + Nz, Nr : int + Number of gridpoints along the considered direction + + Er_m0, Et_m0, Ez_m0 : 2darray of complexs + The electric fields on the interpolation grid for the mode 0 + + Er_m1, Et_m1, Ez_m1 : 2darray of complexs + The electric fields on the interpolation grid for the mode 1 + + Br_m0, Bt_m0, Bz_m0 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 0 + + Br_m1, Bt_m1, Bz_m1 : 2darray of complexs + The magnetic fields on the interpolation grid for the mode 1 + + Ex, Ey, Ez : 1darray of floats + The electric fields acting on the particles + (is modified by this function) + + Bx, By, Bz : 1darray of floats + The magnetic fields acting on the particles + (is modified by this function) + """ + # Deposit the field per cell in parallel + # (for threads < number of particles) + for i in prange(x.shape[0]): + # Preliminary arrays for the cylindrical conversion + # -------------------------------------------- + # Position + xj = x[i] + yj = y[i] + zj = z[i] + + # Cylindrical conversion + rj = math.sqrt(xj**2 + yj**2) + if (rj != 0.): + invr = 1./rj + cos = xj*invr # Cosine + sin = yj*invr # Sine + else: + cos = 1. + sin = 0. + exptheta_m0 = 1. + exptheta_m1 = cos - 1.j*sin + + # Get weights for the deposition + # -------------------------------------------- + # Positions of the particle, in the cell unit + r_cell = invdr*(rj - rmin) - 0.5 + z_cell = invdz*(zj - zmin) - 0.5 + + # Calculate the shape factors + ir = np.zeros(4, dtype=int64) #[0,0,0,0] + Sr = np.zeros(4) #[0.,0.,0.,0.] + ir[0] = int64(math.floor(r_cell)) - 1 + ir[1] = ir[0] + 1 + ir[2] = ir[1] + 1 + ir[3] = ir[2] + 1 + Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3 + Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4) + Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4) + Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3 + iz = np.zeros(4, dtype=int64) #[0,0,0,0] + Sz = np.zeros(4) #[0.,0.,0.,0.] + iz[0] = int64(math.floor(z_cell)) - 1 + iz[1] = iz[0] + 1 + iz[2] = iz[1] + 1 + iz[3] = iz[2] + 1 + Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3 + Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4) + Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4) + Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3 + # Lower and upper periodic boundary for z + index_z = 0 + while index_z < 4: + if iz[index_z] < 0: + iz[index_z] += Nz + if iz[index_z] > Nz - 1: + iz[index_z] -= Nz + index_z += 1 + # Lower and upper boundary for r + index_r = 0 + while index_r < 4: + if ir[index_r] < 0: + ir[index_r] = abs(ir[index_r])-1 + Sr[index_r] = (-1.)*Sr[index_r] + if ir[index_r] > Nr - 1: + ir[index_r] = Nr - 1 + index_r += 1 + + # E-Field + # ---------------------------- + # Define the initial placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
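+ + # In the loops below, a negative product Sz[index_z]*Sr[index_r] marks + # a radial cell that was flipped across the axis: for mode 0, Er and Et + # keep the flipped (negative) weight while Ez is restored to a positive + # weight; for mode 1, the parities are reversed. 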
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]] + if Sz[index_z]*Sr[index_r] < 0: + Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Ez_m0[iz[index_z], ir[index_r]] + else: + Fz_m += Sz[index_z]*Sr[index_r]* \ + Ez_m0[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 1 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + if Sz[index_z]*Sr[index_r] < 0: + Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Er_m1[iz[index_z], ir[index_r]] + Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Et_m1[iz[index_z], ir[index_r]] + else: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Er_m1[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Et_m1[iz[index_z], ir[index_r]] + Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Ex[i] = (cos*Fr - sin*Ft) + Ey[i] = (sin*Fr + cos*Ft) + Ez[i] = Fz + + # B-Field + # ---------------------------- + # Clear the placeholders for the + # gathered field for each coordinate + Fr = 0. + Ft = 0. + Fz = 0. 
+ + # Mode 0 + # ---------------------------- + # Create temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + # Add the fields for mode 0 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Br_m0[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Bt_m0[iz[index_z], ir[index_r]] + if Sz[index_z]*Sr[index_r] < 0: + Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Bz_m0[iz[index_z], ir[index_r]] + else: + Fz_m += Sz[index_z]*Sr[index_r]* \ + Bz_m0[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields from the mode 0 + Fr += (Fr_m*exptheta_m0).real + Ft += (Ft_m*exptheta_m0).real + Fz += (Fz_m*exptheta_m0).real + + # Mode 1 + # ---------------------------- + # Clear the temporary variables + # for the "per mode" gathering + Fr_m = 0.j + Ft_m = 0.j + Fz_m = 0.j + + # Add the fields for mode 1 + index_r = 0 + while index_r < 4: + index_z = 0 + while index_z < 4: + if Sz[index_z]*Sr[index_r] < 0: + Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Br_m1[iz[index_z], ir[index_r]] + Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \ + Bt_m1[iz[index_z], ir[index_r]] + else: + Fr_m += Sz[index_z]*Sr[index_r]* \ + Br_m1[iz[index_z], ir[index_r]] + Ft_m += Sz[index_z]*Sr[index_r]* \ + Bt_m1[iz[index_z], ir[index_r]] + Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]] + index_z += 1 + index_r += 1 + + # Add the fields from the mode 1 + Fr += 2*(Fr_m*exptheta_m1).real + Ft += 2*(Ft_m*exptheta_m1).real + Fz += 2*(Fz_m*exptheta_m1).real + + # Convert to Cartesian coordinates + # and write to particle field arrays + Bx[i] = cos*Fr - sin*Ft + By[i] = sin*Fr + cos*Ft + Bz[i] = Fz + + return Ex, Ey, Ez, Bx, By, Bz diff --git a/fbpic/particles/numba_methods.py b/fbpic/particles/numba_methods.py deleted file mode 100644 index 17d94a84a..000000000 --- a/fbpic/particles/numba_methods.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright 2016, FBPIC contributors -# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters -# License: 3-Clause-BSD-LBNL -""" -This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) -It defines the optimized particles methods that use numba on a CPU -""" -import numba -import math -from scipy.constants import c, e - -# ----------------------- -# Particle pusher utility -# ----------------------- - -@numba.jit(nopython=True) -def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ): - """ - Advance the particles' positions over one half-timestep - - This assumes that the positions (x, y, z) are initially either - one half-timestep *behind* the momenta (ux, uy, uz), or at the - same timestep as the momenta. 
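# ---------------------------------------------------------------------
# A minimal standalone sketch (not part of the patch) of the cubic
# B-spline shape factors used by gather_field_prange_cubic above.
# The value r_cell = 3.7 is an arbitrary example. For any position the
# four weights sum to one, so a uniform field is gathered exactly.
import math
import numpy as np

def cubic_shape_factors( r_cell ):
    # Indices of the four nearest cells, as in the kernel above
    i0 = int( math.floor(r_cell) ) - 1
    idx = np.array([ i0, i0+1, i0+2, i0+3 ])
    S = np.empty(4)
    S[0] = -1./6. * ((r_cell-idx[0])-2)**3
    S[1] = 1./6. * (3*(r_cell-idx[1])**3 - 6*(r_cell-idx[1])**2 + 4)
    S[2] = 1./6. * (3*(idx[2]-r_cell)**3 - 6*(idx[2]-r_cell)**2 + 4)
    S[3] = -1./6. * ((idx[3]-r_cell)-2)**3
    return idx, S

idx, S = cubic_shape_factors( 3.7 )
assert abs( S.sum() - 1. ) < 1e-12   # partition of unity
# ---------------------------------------------------------------------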
- """ - # Half timestep, multiplied by c - chdt = c*0.5*dt - - # Particle push - for ip in range(Ntot) : - x[ip] += chdt * inv_gamma[ip] * ux[ip] - y[ip] += chdt * inv_gamma[ip] * uy[ip] - z[ip] += chdt * inv_gamma[ip] * uz[ip] - -@numba.jit(nopython=True) -def push_p_numba( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) : - """ - Advance the particles' momenta, using numba - """ - # Set a few constants - econst = q*dt/(m*c) - bconst = 0.5*q*dt/m - - # Loop over the particles - for ip in range(Ntot) : - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst ) - -@numba.jit(nopython=True) -def push_p_ioniz_numba( ux, uy, uz, inv_gamma, - Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) : - """ - Advance the particles' momenta, using numba - """ - # Set a few constants - prefactor_econst = e*dt/(m*c) - prefactor_bconst = 0.5*e*dt/m - - # Loop over the particles - for ip in range(Ntot) : - - # For neutral macroparticles, skip this step - if ionization_level[ip] == 0: - continue - - # Calculate the charge dependent constants - econst = prefactor_econst * ionization_level[ip] - bconst = prefactor_bconst * ionization_level[ip] - # Perform the push - ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay( - ux[ip], uy[ip], uz[ip], inv_gamma[ip], - Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], - econst, bconst ) - -@numba.jit(nopython=True) -def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i, - Ex, Ey, Ez, Bx, By, Bz, econst, bconst ): - """ - Push at single macroparticle, using the Vay pusher - """ - # Get the magnetic rotation vector - taux = bconst*Bx - tauy = bconst*By - tauz = bconst*Bz - tau2 = taux**2 + tauy**2 + tauz**2 - - # Get the momenta at the half timestep - uxp = ux_i + econst*Ex \ - + inv_gamma_i*( uy_i*tauz - uz_i*tauy ) - uyp = uy_i + econst*Ey \ - + inv_gamma_i*( uz_i*taux - ux_i*tauz ) - uzp = uz_i + econst*Ez \ - + inv_gamma_i*( ux_i*tauy - uy_i*taux ) - sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2 - utau = uxp*taux + uyp*tauy + uzp*tauz - - # Get the new 1./gamma - inv_gamma_f = math.sqrt( - 2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) ) - - # Reuse the tau and utau variables to save memory - tx = inv_gamma_f*taux - ty = inv_gamma_f*tauy - tz = inv_gamma_f*tauz - ut = inv_gamma_f*utau - s = 1./( 1 + tau2*inv_gamma_f**2 ) - - # Get the new u - ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty ) - uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz ) - uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx ) - - return( ux_f, uy_f, uz_f, inv_gamma_f ) - -# ----------------------- -# Field gathering utility -# ----------------------- - -@numba.jit(nopython=True) -def gather_field_numba(exptheta, m, Fgrid, Fptcl, - iz, ir, Sz, Sr, sign_guards): - """ - Perform the weighted sum using numba - - Parameters - ---------- - exptheta : 1darray of complexs - (one element per macroparticle) - Contains exp(-im theta) for each macroparticle - - m : int - Index of the mode. - Determines wether a factor 2 should be applied - - Fgrid : 2darray of complexs - Contains the fields on the interpolation grid, - from which to do the gathering - - Fptcl : 1darray of floats - (one element per macroparticle) - Contains the fields for each macroparticle - Is modified by this function - - iz, ir : 2darray of ints - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the index of the cells that each macroparticle - will gather from. 
- - Sz, Sr: 2darray of floats - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the weight for respective cells from iz and ir, - for each macroparticle. - - sign_guards : float - The sign (+1 or -1) with which the weight of the guard cells should - be added to the 0th cell. - """ - # Get the total number of particles - Ntot = len(Fptcl) - - # Loop over the particles - for ip in range(Ntot): - # Erase the temporary variable - F = 0.j - # Loop over all the adjacent cells (given by shape order) - # Use helper variables `ir_corr` and `Sr_corr`. - # This is necessary, because ir and Sr should **not** be modified - # **in-place**. (This is because ir and Sr are reused several - # times, as we call the present function 3 times, with different - # values for sign_guards.) - for cell_index_r in range(ir.shape[0]): - for cell_index_z in range(iz.shape[0]): - # Correct the guard cell index and sign - if ir[cell_index_r, ip] < 0: - ir_corr = abs(ir[cell_index_r, ip]) - 1 - Sr_corr = sign_guards * Sr[cell_index_r, ip] - else: - ir_corr = ir[cell_index_r, ip] - Sr_corr = Sr[cell_index_r, ip] - # Gather the field value at the respective grid point - F += Sz[cell_index_z, ip] * Sr_corr * \ - Fgrid[ iz[cell_index_z, ip], ir_corr] - - # Add the complex phase - if m == 0: - Fptcl[ip] += (F * exptheta[ip]).real - if m > 0: - Fptcl[ip] += 2 * (F * exptheta[ip]).real - -# ------------------------- -# Charge deposition utility -# ------------------------- - -@numba.jit(nopython=True) -def deposit_field_numba(Fptcl, Fgrid, - iz, ir, Sz, Sr, sign_guards): - """ - Perform the deposition using numba - - Parameters - ---------- - Fptcl : 1darray of complexs - (one element per macroparticle) - Contains the charge or current for each macroparticle (already - multiplied by exp(im theta), from which to do the deposition - - Fgrid : 2darray of complexs - Contains the fields on the interpolation grid. - Is modified by this function - - iz, ir : 2darray of ints - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the index of the cells that each macroparticle - will deposit to. - - Sz, Sr: 2darray of floats - Arrays of shape (shape_order+1, Ntot) - where Ntot is the number of macroparticles - Contains the weight for respective cells from iz and ir, - for each macroparticle. - - sign_guards : float - The sign (+1 or -1) with which the weight of the guard cells should - be added to the 0th cell. - """ - - # Get the total number of particles - Ntot = len(Fptcl) - - # Loop over all particles - for ip in range(Ntot): - # Loop over adjacent cells (given by shape order) - # Use helper variables `ir_corr` and `Sr_corr`, in order to avoid - # modifying ir and Sr in place. (This is not strictly necessary, - # but is just here as a safeguard.) 
- for cell_index_r in range(ir.shape[0]): - for cell_index_z in range(iz.shape[0]): - # Correct the guard cell index and sign - if ir[cell_index_r, ip] < 0: - ir_corr = abs(ir[cell_index_r, ip]) - 1 - Sr_corr = sign_guards * Sr[cell_index_r, ip] - else: - ir_corr = ir[cell_index_r, ip] - Sr_corr = Sr[cell_index_r, ip] - # Deposit field from particle to the respective grid point - Fgrid[ iz[cell_index_z, ip], ir_corr ] += \ - Sz[cell_index_z,ip] * Sr_corr * Fptcl[ip] diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py index f9a15a945..63e4a871c 100644 --- a/fbpic/particles/particles.py +++ b/fbpic/particles/particles.py @@ -9,28 +9,37 @@ from scipy.constants import c, e from .ionization import Ionizer from .tracking import ParticleTracker +import numba # Load the utility methods -from .utility_methods import weights, unalign_angles -# Load the numba routines -from .numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba, \ - gather_field_numba, deposit_field_numba +from .utilities.utility_methods import weights, unalign_angles +# Load the numba methods +from .push.numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba +from .deposition.numba_methods import deposit_field_numba +from .gathering.numba_methods import gather_field_numba + +# Check if threading is available, then import threaded functions +from fbpic.threading_utils import threading_enabled +if threading_enabled: + from .deposition.threading_methods import deposit_rho_prange_linear, \ + deposit_J_prange_linear, deposit_rho_prange_cubic, \ + deposit_J_prange_cubic, sum_reduce_2d_array + from .gathering.threading_methods import gather_field_prange_linear, \ + gather_field_prange_cubic # Check if CUDA is available, then import CUDA functions from fbpic.cuda_utils import cuda_installed if cuda_installed: + # Load the CUDA methods from fbpic.cuda_utils import cuda, cuda_tpb_bpg_1d, cuda_tpb_bpg_2d - from .cuda_methods import push_p_gpu, push_p_ioniz_gpu, push_x_gpu, \ - gather_field_gpu_linear, gather_field_gpu_cubic, \ - write_sorting_buffer, cuda_deposition_arrays, \ + from .push.cuda_methods import push_p_gpu, push_p_ioniz_gpu, push_x_gpu + from .deposition.cuda_methods import deposit_rho_gpu_linear, \ + deposit_J_gpu_linear, deposit_rho_gpu_cubic, deposit_J_gpu_cubic + from .gathering.cuda_methods import gather_field_gpu_linear, \ + gather_field_gpu_cubic + from .utilities.cuda_sorting import write_sorting_buffer, \ get_cell_idx_per_particle, sort_particles_per_cell, \ reset_prefix_sum, incl_prefix_sum - from .cuda_deposition.cubic import deposit_rho_gpu_cubic, \ - deposit_J_gpu_cubic - from .cuda_deposition.linear import deposit_rho_gpu_linear, \ - deposit_J_gpu_linear - from .cuda_deposition.linear_non_atomic import deposit_rho_gpu, \ - deposit_J_gpu, add_rho, add_J class Particles(object) : """ @@ -50,7 +59,8 @@ def __init__(self, q, m, n, Npz, zmin, zmax, ux_m=0., uy_m=0., uz_m=0., ux_th=0., uy_th=0., uz_th=0., dens_func=None, continuous_injection=True, - use_cuda=False, grid_shape=None, particle_shape='linear' ) : + grid_shape=None, particle_shape='linear', + use_cuda=False ) : """ Initialize a uniform set of particles @@ -100,9 +110,6 @@ def dens_func( z, r ) ... Whether to continuously inject the particles, in the case of a moving window - use_cuda : bool, optional - Wether to use the GPU or not. - grid_shape: tuple, optional Needed when running on the GPU The shape of the local grid (including guard cells), i.e. @@ -111,15 +118,16 @@ def dens_func( z, r ) ... 
        particle_shape: str, optional
            Set the particle shape for the charge/current deposition.
-            Possible values are 'cubic', 'linear' and 'linear_non_atomic'.
-            While 'cubic' corresponds to third order shapes and 'linear'
-            to first order shapes, 'linear_non_atomic' uses an equivalent
-            deposition scheme to 'linear' which avoids atomics on the GPU.
+            Possible values are 'linear' and 'cubic' for first and third
+            order particle shape factors.
+
+        use_cuda : bool, optional
+            Whether to use the GPU or not.
        """
        # Register the timestep
        self.dt = dt

-        # Define wether or not to use the GPU
+        # Define whether or not to use the GPU
        self.use_cuda = use_cuda
        if (self.use_cuda==True) and (cuda_installed==False) :
            print('*** Cuda not available for the particles.')
@@ -198,6 +206,9 @@ def dens_func( z, r ) ...
        if dens_func is not None :
            self.w[:] = self.w * dens_func( self.z, r )

+        # Register particle shape
+        self.particle_shape = particle_shape
+
        # Allocate arrays and register variables when using CUDA
        if self.use_cuda:
            if grid_shape is None:
@@ -213,9 +224,11 @@ def dens_func( z, r ) ...
                dtype=np.int32 )
            # Register boolean that records if the particles are sorted or not
            self.sorted = False
-
-        # Register particle shape
-        self.particle_shape = particle_shape
+        # Register variables when using multithreading
+        self.use_threading = threading_enabled
+        if self.use_threading == True:
+            # Register number of threads
+            self.nthreads = numba.config.NUMBA_NUM_THREADS

    def send_particles_to_gpu( self ):
        """
@@ -423,6 +436,7 @@ def push_p( self ) :
        half-timestep *behind* the positions (x, y, z), and it brings them
        one half-timestep *ahead* of the positions.
        """
+        # GPU (CUDA) version
        if self.use_cuda:
            # Get the threads per block and the blocks per grid
            dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
@@ -441,7 +455,8 @@ def push_p( self ) :
                self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
                self.m, self.Ntot, self.dt, self.ionizer.ionization_level )
-        else :
+        # CPU version
+        else:
            if self.ionizer is None:
                push_p_numba(self.ux, self.uy, self.uz, self.inv_gamma,
                    self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
@@ -461,6 +476,7 @@ def halfpush_x( self ) :
        one half-timestep *behind* the momenta (ux, uy, uz), or at the
        same timestep as the momenta.
        """
+        # GPU (CUDA) version
        if self.use_cuda:
            # Get the threads per block and the blocks per grid
            dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
@@ -471,7 +487,8 @@ def halfpush_x( self ) :
                self.inv_gamma, self.dt )
            # The particle array is unsorted after the push in x
            self.sorted = False
-        else :
+        # CPU version
+        else:
            push_x_numba( self.x, self.y, self.z,
                self.ux, self.uy, self.uz,
                self.inv_gamma, self.Ntot, self.dt )
@@ -489,12 +506,24 @@ def gather( self, grid ) :
            (one InterpolationGrid object per azimuthal mode)
            Contains the field values on the interpolation grid
        """
-        if self.use_cuda == True:
+        # GPU (CUDA) version
+        if self.use_cuda:
            # Get the threads per block and the blocks per grid
            dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
            # Call the CUDA Kernel for the gathering of E and B Fields
            # for Mode 0 and 1 only.
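# ---------------------------------------------------------------------
# A hedged sketch (not part of the patch) of the 1D launch convention
# used throughout this file: cuda_tpb_bpg_1d (imported above from
# fbpic.cuda_utils) returns the launch pair that the kernels are indexed
# with, and each per-particle kernel guards itself with an explicit
# bounds check. The kernel name `scale_gpu` is a hypothetical example.
from numba import cuda
from fbpic.cuda_utils import cuda_tpb_bpg_1d

@cuda.jit
def scale_gpu( x, a ):
    i = cuda.grid(1)
    if i < x.shape[0]:   # bounds guard, as in push_x_gpu further below
        x[i] = a * x[i]

# dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( Ntot )
# scale_gpu[dim_grid_1d, dim_block_1d]( d_x, 2. )
# ---------------------------------------------------------------------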
- if self.particle_shape == 'cubic': + if self.particle_shape == 'linear': + gather_field_gpu_linear[dim_grid_1d, dim_block_1d]( + self.x, self.y, self.z, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + grid[0].Er, grid[0].Et, grid[0].Ez, + grid[1].Er, grid[1].Et, grid[1].Ez, + grid[0].Br, grid[0].Bt, grid[0].Bz, + grid[1].Br, grid[1].Bt, grid[1].Bz, + self.Ex, self.Ey, self.Ez, + self.Bx, self.By, self.Bz) + elif self.particle_shape == 'cubic': gather_field_gpu_cubic[dim_grid_1d, dim_block_1d]( self.x, self.y, self.z, grid[0].invdz, grid[0].zmin, grid[0].Nz, @@ -506,7 +535,24 @@ def gather( self, grid ) : self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz) else: - gather_field_gpu_linear[dim_grid_1d, dim_block_1d]( + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # CPU multi-threading version + elif self.use_threading: + if self.particle_shape == 'linear': + gather_field_prange_linear( + self.x, self.y, self.z, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + grid[0].Er, grid[0].Et, grid[0].Ez, + grid[1].Er, grid[1].Et, grid[1].Ez, + grid[0].Br, grid[0].Bt, grid[0].Bz, + grid[1].Br, grid[1].Bt, grid[1].Bz, + self.Ex, self.Ey, self.Ez, + self.Bx, self.By, self.Bz) + elif self.particle_shape == 'cubic': + gather_field_prange_cubic( self.x, self.y, self.z, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, @@ -516,6 +562,11 @@ def gather( self, grid ) : grid[1].Br, grid[1].Bt, grid[1].Bz, self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz) + else: + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # CPU single-core version else: # Preliminary arrays for the cylindrical conversion r = np.sqrt( self.x**2 + self.y**2 ) @@ -624,19 +675,13 @@ def deposit( self, fld, fieldtype ) : """ # Shortcut for the list of InterpolationGrid objects grid = fld.interp - - if self.use_cuda == True: + # GPU (CUDA) version + if self.use_cuda: # Get the threads per block and the blocks per grid dim_grid_2d_flat, dim_block_2d_flat = cuda_tpb_bpg_1d( grid[0].Nz*grid[0].Nr ) dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d( grid[0].Nz, grid[0].Nr ) - - # Create the helper arrays for deposition - if self.particle_shape == 'linear_non_atomic': - d_F0, d_F1, d_F2, d_F3 = cuda_deposition_arrays( - grid[0].Nz, grid[0].Nr, fieldtype=fieldtype) - # Sort the particles if self.sorted is False: self.sort_particles(fld=fld) @@ -648,17 +693,13 @@ def deposit( self, fld, fieldtype ) : # Rho if fieldtype == 'rho': # Deposit rho in each of four directions - if self.particle_shape == 'linear_non_atomic': - deposit_rho_gpu[dim_grid_2d_flat, dim_block_2d_flat]( + if self.particle_shape == 'linear': + deposit_rho_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat]( self.x, self.y, self.z, self.w, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, - d_F0, d_F1, d_F2, d_F3, - self.cell_idx, self.prefix_sum) - # Add the four directions together - add_rho[dim_grid_2d, dim_block_2d]( grid[0].rho, grid[1].rho, - d_F0, d_F1, d_F2, d_F3) + self.cell_idx, self.prefix_sum) elif self.particle_shape == 'cubic': deposit_rho_gpu_cubic[dim_grid_2d_flat, dim_block_2d_flat]( self.x, self.y, self.z, self.w, @@ -666,33 +707,23 @@ def deposit( self, fld, fieldtype ) : grid[0].invdr, grid[0].rmin, grid[0].Nr, grid[0].rho, grid[1].rho, self.cell_idx, self.prefix_sum) - elif 
self.particle_shape == 'linear':
-                    deposit_rho_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
-                        self.x, self.y, self.z, self.w,
-                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
-                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        grid[0].rho, grid[1].rho,
-                        self.cell_idx, self.prefix_sum)
                else:
-                    raise ValueError("`particle_shape` should be either 'linear', 'linear_atomic' \
-                        or 'cubic' but is `%s`" % self.particle_shape)
+                    raise ValueError("`particle_shape` should be either \
+                        'linear' or 'cubic' \
+                        but is `%s`" % self.particle_shape)
            # J
            elif fieldtype == 'J':
                # Deposit J in each of four directions
-                if self.particle_shape == 'linear_non_atomic':
-                    deposit_J_gpu[dim_grid_2d_flat, dim_block_2d_flat](
+                if self.particle_shape == 'linear':
+                    deposit_J_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
                        self.x, self.y, self.z, self.w,
                        self.ux, self.uy, self.uz, self.inv_gamma,
                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        d_F0, d_F1, d_F2, d_F3,
-                        self.cell_idx, self.prefix_sum)
-                    # Add the four directions together
-                    add_J[dim_grid_2d, dim_block_2d](
                        grid[0].Jr, grid[1].Jr,
                        grid[0].Jt, grid[1].Jt,
                        grid[0].Jz, grid[1].Jz,
-                        d_F0, d_F1, d_F2, d_F3)
+                        self.cell_idx, self.prefix_sum)
                elif self.particle_shape == 'cubic':
                    deposit_J_gpu_cubic[dim_grid_2d_flat, dim_block_2d_flat](
                        self.x, self.y, self.z, self.w,
@@ -703,26 +734,114 @@ def deposit( self, fld, fieldtype ) :
                        grid[0].Jt, grid[1].Jt,
                        grid[0].Jz, grid[1].Jz,
                        self.cell_idx, self.prefix_sum)
-                elif self.particle_shape == 'linear':
-                    deposit_J_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
+                else:
+                    raise ValueError("`particle_shape` should be either \
+                        'linear' or 'cubic' \
+                        but is `%s`" % self.particle_shape)
+            else:
+                raise ValueError("`fieldtype` should be either 'J' or \
+                    'rho', but is `%s`" % fieldtype)
+        # CPU multi-threading version
+        elif self.use_threading:
+
+            # Divide particles into chunks (each chunk is handled by a
+            # different thread) and register the indices that bound each chunk
+            n_avg_per_thread = int( self.Ntot/self.nthreads )
+            # Assign n_avg_per_thread to each thread (except the last one)
+            ptcl_chunk_indices = np.array(
+                [ i_chk*n_avg_per_thread for i_chk in range(self.nthreads+1) ],
+                dtype=np.uint64 )
+            ptcl_chunk_indices[-1] = self.Ntot
+            # Multithreading functions for the deposition of rho or J
+            # for Mode 0 and 1 only.
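# ---------------------------------------------------------------------
# Worked example (not part of the patch) of the chunk splitting above,
# with hypothetical values Ntot = 10 and nthreads = 4: integer division
# gives 2 particles per thread and the last chunk absorbs the remainder.
import numpy as np
Ntot, nthreads = 10, 4
n_avg_per_thread = int( Ntot/nthreads )
ptcl_chunk_indices = np.array(
    [ i_chk*n_avg_per_thread for i_chk in range(nthreads+1) ],
    dtype=np.uint64 )
ptcl_chunk_indices[-1] = Ntot
# -> [0, 2, 4, 6, 10]; thread t handles the particles in the slice
#    ptcl_chunk_indices[t]:ptcl_chunk_indices[t+1]
# ---------------------------------------------------------------------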
+ if fieldtype == 'rho': + # Generate temporary arrays for rho + rho_m0_global = np.zeros( + (self.nthreads, grid[0].rho.shape[0], grid[0].rho.shape[1]), + dtype=grid[0].rho.dtype ) + rho_m1_global = np.zeros( + (self.nthreads, grid[1].rho.shape[0], grid[1].rho.shape[1]), + dtype=grid[1].rho.dtype ) + # Deposit rho using CPU threading + if self.particle_shape == 'linear': + deposit_rho_prange_linear( + self.x, self.y, self.z, self.w, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + rho_m0_global, rho_m1_global, + self.nthreads, ptcl_chunk_indices ) + elif self.particle_shape == 'cubic': + deposit_rho_prange_cubic( + self.x, self.y, self.z, self.w, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + rho_m0_global, rho_m1_global, + self.nthreads, ptcl_chunk_indices ) + else: + raise ValueError("`particle_shape` should be either \ + 'linear' or 'cubic' \ + but is `%s`" % self.particle_shape) + # Sum thread-local results to main field array + sum_reduce_2d_array( rho_m0_global, grid[0].rho ) + sum_reduce_2d_array( rho_m1_global, grid[1].rho ) + + elif fieldtype == 'J': + # Generate temporary arrays for J + Jr_m0_global = np.zeros( + (self.nthreads, grid[0].Jr.shape[0], grid[0].Jr.shape[1]), + dtype=grid[0].Jr.dtype ) + Jt_m0_global = np.zeros( + (self.nthreads, grid[0].Jt.shape[0], grid[0].Jt.shape[1]), + dtype=grid[0].Jt.dtype ) + Jz_m0_global = np.zeros( + (self.nthreads, grid[0].Jz.shape[0], grid[0].Jz.shape[1]), + dtype=grid[0].Jz.dtype ) + Jr_m1_global = np.zeros( + (self.nthreads, grid[1].Jr.shape[0], grid[1].Jr.shape[1]), + dtype=grid[1].Jr.dtype ) + Jt_m1_global = np.zeros( + (self.nthreads, grid[1].Jt.shape[0], grid[1].Jt.shape[1]), + dtype=grid[1].Jt.dtype ) + Jz_m1_global = np.zeros( + (self.nthreads, grid[1].Jz.shape[0], grid[1].Jz.shape[1]), + dtype=grid[1].Jz.dtype ) + # Deposit J using CPU threading + if self.particle_shape == 'linear': + deposit_J_prange_linear( self.x, self.y, self.z, self.w, self.ux, self.uy, self.uz, self.inv_gamma, grid[0].invdz, grid[0].zmin, grid[0].Nz, grid[0].invdr, grid[0].rmin, grid[0].Nr, - grid[0].Jr, grid[1].Jr, - grid[0].Jt, grid[1].Jt, - grid[0].Jz, grid[1].Jz, - self.cell_idx, self.prefix_sum) + Jr_m0_global, Jr_m1_global, + Jt_m0_global, Jt_m1_global, + Jz_m0_global, Jz_m1_global, + self.nthreads, ptcl_chunk_indices ) + elif self.particle_shape == 'cubic': + deposit_J_prange_cubic( + self.x, self.y, self.z, self.w, + self.ux, self.uy, self.uz, self.inv_gamma, + grid[0].invdz, grid[0].zmin, grid[0].Nz, + grid[0].invdr, grid[0].rmin, grid[0].Nr, + Jr_m0_global, Jr_m1_global, + Jt_m0_global, Jt_m1_global, + Jz_m0_global, Jz_m1_global, + self.nthreads, ptcl_chunk_indices ) else: raise ValueError("`particle_shape` should be either \ - 'linear', 'linear_atomic' or 'cubic' \ + 'linear' or 'cubic' \ but is `%s`" % self.particle_shape) + # Sum thread-local results to main field array + sum_reduce_2d_array( Jr_m0_global, grid[0].Jr ) + sum_reduce_2d_array( Jt_m0_global, grid[0].Jt ) + sum_reduce_2d_array( Jz_m0_global, grid[0].Jz ) + sum_reduce_2d_array( Jr_m1_global, grid[1].Jr ) + sum_reduce_2d_array( Jt_m1_global, grid[1].Jt ) + sum_reduce_2d_array( Jz_m1_global, grid[1].Jz ) + else: raise ValueError("`fieldtype` should be either 'J' or \ 'rho', but is `%s`" % fieldtype) - - - # CPU version + # CPU single-core version else: # Preliminary arrays for the cylindrical conversion r = np.sqrt( self.x**2 + self.y**2 ) @@ -745,7 +864,7 @@ def deposit( self, fld, fieldtype ) : # number 
of elements in the grid list Nm = len(grid) - if fieldtype == 'rho' : + if fieldtype == 'rho': # --------------------------------------- # Deposit the charge density mode by mode # --------------------------------------- @@ -765,7 +884,7 @@ def deposit( self, fld, fieldtype ) : deposit_field_numba(self.w*exptheta, grid[m].rho, iz, ir, Sz, Sr, -1.) - elif fieldtype == 'J' : + elif fieldtype == 'J': # ---------------------------------------- # Deposit the current density mode by mode # ---------------------------------------- @@ -793,9 +912,9 @@ def deposit( self, fld, fieldtype ) : deposit_field_numba(Jz*exptheta, grid[m].Jz, iz, ir, Sz, Sr, -1.) - else : - raise ValueError( - "`fieldtype` should be either 'J' or 'rho', but is `%s`" %fieldtype ) + else: + raise ValueError("`fieldtype` should be either 'J' or \ + 'rho', but is `%s`" % fieldtype) def sort_particles(self, fld): """ diff --git a/fbpic/particles/push/__init__.py b/fbpic/particles/push/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/fbpic/particles/push/cuda_methods.py b/fbpic/particles/push/cuda_methods.py new file mode 100644 index 000000000..41d48d12a --- /dev/null +++ b/fbpic/particles/push/cuda_methods.py @@ -0,0 +1,191 @@ +# Copyright 2016, FBPIC contributors +# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters +# License: 3-Clause-BSD-LBNL +""" +This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC) +It defines the particle push methods on the GPU using CUDA. +""" +from numba import cuda +import math +from scipy.constants import c, e + +@cuda.jit('void(float64[:], float64[:], float64[:], \ + float64[:], float64[:], float64[:], \ + float64[:], float64)') +def push_x_gpu( x, y, z, ux, uy, uz, inv_gamma, dt ) : + """ + Advance the particles' positions over one half-timestep + + This assumes that the positions (x, y, z) are initially either + one half-timestep *behind* the momenta (ux, uy, uz), or at the + same timestep as the momenta. 
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+        (is modified by this function)
+
+    ux, uy, uz : 1darray of floats (in meters * second^-1)
+        The velocity of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    dt : float (seconds)
+        The time by which the position is advanced
+    """
+    # Half timestep, multiplied by c
+    chdt = c*0.5*dt
+
+    i = cuda.grid(1)
+    if i < x.shape[0]:
+        # Particle push
+        inv_g = inv_gamma[i]
+        x[i] += chdt*inv_g*ux[i]
+        y[i] += chdt*inv_g*uy[i]
+        z[i] += chdt*inv_g*uz[i]
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+    float64[:], float64[:], float64[:], \
+    float64[:], float64[:], float64[:], \
+    float64, float64, int32, float64)')
+def push_p_gpu( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz,
+                q, m, Ntot, dt ) :
+    """
+    Advance the particles' momenta, using cuda on the GPU
+
+    Parameters
+    ----------
+    ux, uy, uz : 1darray of floats
+        The velocity of the particles
+        (is modified by this function)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+
+    q : float
+        The charge of the particle species
+
+    m : float
+        The mass of the particle species
+
+    Ntot : int
+        The total number of particles
+
+    dt : float
+        The time by which the momenta are advanced
+    """
+    # Set a few constants
+    econst = q*dt/(m*c)
+    bconst = 0.5*q*dt/m
+
+    # Cuda 1D grid
+    ip = cuda.grid(1)
+
+    # Loop over the particles
+    if ip < Ntot:
+        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
+            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst)
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+    float64[:], float64[:], float64[:], \
+    float64[:], float64[:], float64[:], \
+    float64, int32, float64, int16[:])')
+def push_p_ioniz_gpu( ux, uy, uz, inv_gamma,
+                      Ex, Ey, Ez, Bx, By, Bz,
+                      m, Ntot, dt, ionization_level ) :
+    """
+    Advance the particles' momenta, using numba on the GPU.
+    This takes into account that the particles are ionizable, and thus
+    that their charge is determined by `ionization_level`
+
+    Parameters
+    ----------
+    ux, uy, uz : 1darray of floats
+        The velocity of the particles
+        (is modified by this function)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+
+    m : float
+        The mass of the particle species
+
+    Ntot : int
+        The total number of particles
+
+    dt : float
+        The time by which the momenta are advanced
+
+    ionization_level : 1darray of ints
+        The number of electrons that each ion is missing
+        (compared to a neutral atom)
+    """
+    # Cuda 1D grid
+    ip = cuda.grid(1)
+
+    # Loop over the particles
+    if ip < Ntot:
+        if ionization_level[ip] != 0:
+            # Set a few constants
+            econst = ionization_level[ip] * e * dt/(m*c)
+            bconst = 0.5 * ionization_level[ip] * e * dt/m
+            # Use the Vay pusher
+            ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
+                ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+                Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst)
+
+@cuda.jit(device=True, inline=True)
+def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
+                Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
+    """
+    Push a single macroparticle,
using the Vay pusher
+    """
+    # Get the magnetic rotation vector
+    taux = bconst*Bx
+    tauy = bconst*By
+    tauz = bconst*Bz
+    tau2 = taux**2 + tauy**2 + tauz**2
+
+    # Get the momenta at the half timestep
+    uxp = ux_i + econst*Ex \
+    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
+    uyp = uy_i + econst*Ey \
+    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
+    uzp = uz_i + econst*Ez \
+    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
+    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
+    utau = uxp*taux + uyp*tauy + uzp*tauz
+
+    # Get the new 1./gamma
+    inv_gamma_f = math.sqrt(
+        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
+
+    # Reuse the tau and utau variables to save memory
+    tx = inv_gamma_f*taux
+    ty = inv_gamma_f*tauy
+    tz = inv_gamma_f*tauz
+    ut = inv_gamma_f*utau
+    s = 1./( 1 + tau2*inv_gamma_f**2 )
+
+    # Get the new u
+    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
+    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
+    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
+
+    return( ux_f, uy_f, uz_f, inv_gamma_f )
\ No newline at end of file
diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py
new file mode 100644
index 000000000..6e3843e1b
--- /dev/null
+++ b/fbpic/particles/push/numba_methods.py
@@ -0,0 +1,117 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the particle push methods on the CPU with numba.
+"""
+import math
+import numba
+from fbpic.threading_utils import njit_parallel, prange
+from scipy.constants import c, e
+
+@njit_parallel
+def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
+    """
+    Advance the particles' positions over one half-timestep
+
+    This assumes that the positions (x, y, z) are initially either
+    one half-timestep *behind* the momenta (ux, uy, uz), or at the
+    same timestep as the momenta.
+    """
+    # Half timestep, multiplied by c
+    chdt = c*0.5*dt
+
+    # Particle push (in parallel if threading is installed)
+    for ip in prange(Ntot) :
+        x[ip] += chdt * inv_gamma[ip] * ux[ip]
+        y[ip] += chdt * inv_gamma[ip] * uy[ip]
+        z[ip] += chdt * inv_gamma[ip] * uz[ip]
+
+    return x, y, z
+
+@njit_parallel
+def push_p_numba( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) :
+    """
+    Advance the particles' momenta, using numba
+    """
+    # Set a few constants
+    econst = q*dt/(m*c)
+    bconst = 0.5*q*dt/m
+
+    # Loop over the particles (in parallel if threading is installed)
+    for ip in prange(Ntot) :
+        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
+            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst )
+
+    return ux, uy, uz, inv_gamma
+
+@njit_parallel
+def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) :
+    """
+    Advance the particles' momenta, using numba
+    """
+    # Set a few constants
+    prefactor_econst = e*dt/(m*c)
+    prefactor_bconst = 0.5*e*dt/m
+
+    # Loop over the particles (in parallel if threading is installed)
+    for ip in prange(Ntot) :
+
+        # For neutral macroparticles, skip this step
+        if ionization_level[ip] == 0:
+            continue
+
+        # Calculate the charge dependent constants
+        econst = prefactor_econst * ionization_level[ip]
+        bconst = prefactor_bconst * ionization_level[ip]
+        # Perform the push
+        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
+            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip],
+            econst, bconst )
+
+    return ux, uy, uz, inv_gamma
+
+@numba.njit
+def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
+    Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
+    """
+    Push a single macroparticle, using the Vay pusher
+    """
+    # Get the magnetic rotation vector
+    taux = bconst*Bx
+    tauy = bconst*By
+    tauz = bconst*Bz
+    tau2 = taux**2 + tauy**2 + tauz**2
+
+    # Get the momenta at the half timestep
+    uxp = ux_i + econst*Ex \
+    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
+    uyp = uy_i + econst*Ey \
+    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
+    uzp = uz_i + econst*Ez \
+    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
+    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
+    utau = uxp*taux + uyp*tauy + uzp*tauz
+
+    # Get the new 1./gamma
+    inv_gamma_f = math.sqrt(
+        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
+
+    # Reuse the tau and utau variables to save memory
+    tx = inv_gamma_f*taux
+    ty = inv_gamma_f*tauy
+    tz = inv_gamma_f*tauz
+    ut = inv_gamma_f*utau
+    s = 1./( 1 + tau2*inv_gamma_f**2 )
+
+    # Get the new u
+    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
+    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
+    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
+
+    return( ux_f, uy_f, uz_f, inv_gamma_f )
diff --git a/fbpic/particles/utilities/__init__.py b/fbpic/particles/utilities/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fbpic/particles/utilities/cuda_sorting.py b/fbpic/particles/utilities/cuda_sorting.py
new file mode 100644
index 000000000..45a1ddb9f
--- /dev/null
+++ b/fbpic/particles/utilities/cuda_sorting.py
@@ -0,0 +1,184 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the particle sorting methods on the GPU using CUDA.
+"""
+from numba import cuda
+from accelerate.cuda import sorting
+import math
+import numpy as np
+
+# -----------------------------------------------------
+# Sorting utilities - get_cell_idx / sort / prefix_sum
+# -----------------------------------------------------
+
+@cuda.jit('void(int32[:], uint32[:], \
+    float64[:], float64[:], float64[:], \
+    float64, float64, int32, \
+    float64, float64, int32)')
+def get_cell_idx_per_particle(cell_idx, sorted_idx,
+                              x, y, z,
+                              invdz, zmin, Nz,
+                              invdr, rmin, Nr):
+    """
+    Get the cell index of each particle.
+    The cell index is 1d and calculated by:
+    cell index in r + cell index in z * number of cells in r.
+    The cell_idx of a particle is defined by
+    the lower cell in r and z that it deposits its field to.
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    sorted_idx : 1darray of integers
+        The sorted index array; it is reset
+        by this function before doing the sort
+
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box, in each direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+    """
+    i = cuda.grid(1)
+    if i < cell_idx.shape[0]:
+        # Preliminary variables for the cylindrical conversion
+        xj = x[i]
+        yj = y[i]
+        zj = z[i]
+        rj = math.sqrt( xj**2 + yj**2 )
+
+        # Positions of the particles, in the cell unit
+        r_cell = invdr*(rj - rmin) - 0.5
+        z_cell = invdz*(zj - zmin) - 0.5
+
+        # Index of the lower cell in r and z
+        ir_lower = int(math.floor( r_cell ))
+        iz_lower = int(math.floor( z_cell ))
+
+        # Treat the boundary conditions
+        # guard cells in lower r
+        if ir_lower < 0:
+            ir_lower = 0
+        # absorbing in upper r
+        if ir_lower > Nr-1:
+            ir_lower = Nr-1
+        # periodic boundaries in z
+        if iz_lower < 0:
+            iz_lower += Nz
+        if iz_lower > Nz-1:
+            iz_lower -= Nz
+
+        # Reset sorted_idx array
+        sorted_idx[i] = i
+        # Calculate the 1D cell_idx by cell_idx_ir + cell_idx_iz * Nr
+        cell_idx[i] = ir_lower + iz_lower * Nr
+
+def sort_particles_per_cell(cell_idx, sorted_idx):
+    """
+    Sort the cell index of the particles and
+    modify the sorted index array accordingly.
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    sorted_idx : 1darray of integers
+        Represents the original index of the
+        particle before the sorting.
+    """
+    Ntot = cell_idx.shape[0]
+    if Ntot > 0:
+        sorter = sorting.RadixSort(Ntot, dtype = np.int32)
+        sorter.sort(cell_idx, vals = sorted_idx)
+
+@cuda.jit('void(int32[:], int32[:])')
+def incl_prefix_sum(cell_idx, prefix_sum):
+    """
+    Perform an inclusive parallel prefix sum on the sorted
+    cell index array. The prefix sum array represents the
+    cumulative sum of the number of particles per cell
+    for each cell index.
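# ---------------------------------------------------------------------
# Worked example (not part of the patch) of the inclusive prefix sum:
# for 4 grid cells and sorted cell indices [0, 0, 2, 2, 3], the array
# below is [2, 2, 4, 5], i.e. prefix_sum[c] counts the particles in
# cells 0..c, so cell c owns the slice prefix_sum[c-1]:prefix_sum[c].
# The same result can be cross-checked on the CPU with NumPy:
import numpy as np
cell_idx = np.array([0, 0, 2, 2, 3])
prefix_sum = np.searchsorted( cell_idx, np.arange(4), side='right' )
# -> array([2, 2, 4, 5])
# ---------------------------------------------------------------------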
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    # i is the index of the macroparticle
+    i = cuda.grid(1)
+    if i < cell_idx.shape[0]-1:
+        # ci: index of the cell of the present macroparticle
+        ci = cell_idx[i]
+        # ci_next: index of the cell of the next macroparticle
+        ci_next = cell_idx[i+1]
+        # Fill all the cells between ci and ci_next with the
+        # inclusive cumulative sum of the number of particles up to ci
+        while ci < ci_next:
+            # The cumulative sum of the number of particles per cell
+            # up to ci is i+1 (since i is a 0-based Python index)
+            prefix_sum[ci] = i+1
+            ci += 1
+    # The last "macroparticle" of the cell_idx array fills up the
+    # rest of the prefix sum array
+    if i == cell_idx.shape[0]-1:
+        # Get the cell_index of the last macroparticle
+        ci = cell_idx[i]
+        # Fill all the remaining entries of the prefix sum array
+        for empty_index in range(ci, prefix_sum.shape[0]):
+            prefix_sum[empty_index] = i+1
+
+@cuda.jit('void(int32[:])')
+def reset_prefix_sum(prefix_sum):
+    """
+    Resets the prefix sum. Sets all the values
+    to zero.
+
+    Parameters
+    ----------
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    i = cuda.grid(1)
+    if i < prefix_sum.shape[0]:
+        prefix_sum[i] = 0
+
+@cuda.jit('void(uint32[:], float64[:], float64[:])')
+def write_sorting_buffer(sorted_idx, val, buf):
+    """
+    Writes the values of a particle array to a buffer,
+    while rearranging them to match the sorted cell index array.
+
+    Parameters
+    ----------
+    sorted_idx : 1darray of integers
+        Represents the original index of the
+        particle before the sorting
+
+    val : 1d array of floats
+        A particle data array
+
+    buf : 1d array of floats
+        A buffer array to temporarily store the
+        sorted particle data array
+    """
+    i = cuda.grid(1)
+    if i < val.shape[0]:
+        buf[i] = val[sorted_idx[i]]
diff --git a/fbpic/particles/utility_methods.py b/fbpic/particles/utilities/utility_methods.py
similarity index 98%
rename from fbpic/particles/utility_methods.py
rename to fbpic/particles/utilities/utility_methods.py
index d0de6c5b8..d79019f0d 100644
--- a/fbpic/particles/utility_methods.py
+++ b/fbpic/particles/utilities/utility_methods.py
@@ -3,7 +3,7 @@
 # License: 3-Clause-BSD-LBNL
 """
 This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the optimized particles methods that use numba on a CPU
+It defines particle utility methods.
 """
 import numpy as np
diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py
new file mode 100644
index 000000000..a35d9a8c5
--- /dev/null
+++ b/fbpic/threading_utils.py
@@ -0,0 +1,39 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines a set of generic functions for multithreaded CPU execution.
+"""
+import os
+from numba import njit
+
+# By default threading is enabled
+threading_enabled = True
+
+# Check if the environment variable FBPIC_DISABLE_THREADING is set to 1
+# and in that case, disable threading
+if 'FBPIC_DISABLE_THREADING' in os.environ:
+    if int(os.environ['FBPIC_DISABLE_THREADING']) == 1:
+        threading_enabled = False
+
+# If the user requests threading (by not setting FBPIC_DISABLE_THREADING),
+# check that it is indeed available
+if threading_enabled:
+    try:
+        # Try to import the threading function prange
+        from numba import prange as numba_prange
+    except ImportError:
+        threading_enabled = False
+        print('*** Threading not available for the simulation.')
+        print('*** (Please make sure that numba>0.34 is installed)')
+
+# Set the functions njit_parallel and prange to the correct objects
+if not threading_enabled:
+    # Use the regular serial compilation function
+    njit_parallel = njit
+    prange = range
+else:
+    # Use the parallel compilation function
+    njit_parallel = njit( parallel=True )
+    prange = numba_prange
diff --git a/tests/test_uniform_rho_deposition.py b/tests/test_uniform_rho_deposition.py
index e0a7978e6..8978f9625 100644
--- a/tests/test_uniform_rho_deposition.py
+++ b/tests/test_uniform_rho_deposition.py
@@ -10,7 +10,7 @@
 (i.e. this confirms that no Verboncoeur-type correction is needed)
 - Shifting this plasma by a small amount in r, and still verifying
 that the deposited density is uniform
-The tests are performed with different particle shapes: linear
+The tests are performed with different particle shapes: linear, cubic

 Usage :
 from the top-level directory of FBPIC run
@@ -47,7 +47,7 @@ def test_uniform_electron_plasma(show=False):
     "Function that is run by py.test, when doing `python setup.py test`"
-    for shape in ['cubic', 'linear', 'linear_non_atomic']:
+    for shape in ['linear', 'cubic']:
         uniform_electron_plasma( shape, show )

 def uniform_electron_plasma(shape, show=False):
@@ -87,7 +87,7 @@ def test_neutral_plasma_shifted(show=False):
     "Function that is run by py.test, when doing `python setup.py test`"
-    for shape in ['cubic', 'linear', 'linear_non_atomic']:
+    for shape in ['linear', 'cubic']:
         neutral_plasma_shifted( shape, show )

 def neutral_plasma_shifted(shape, show=False):
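A short usage sketch of the fallback defined in fbpic/threading_utils.py above. Because njit_parallel and prange degrade to njit and range when threading is unavailable, the same kernel source runs multithreaded or serial; the function scale_cpu below is a hypothetical example, not part of the patch:

from fbpic.threading_utils import njit_parallel, prange

@njit_parallel
def scale_cpu( x, a ):
    # Runs in parallel when numba>0.34 provides prange,
    # and falls back to a serial loop otherwise
    for i in prange( x.shape[0] ):
        x[i] = a * x[i]
    return x

# Threading can also be disabled explicitly, e.g. for debugging:
#   FBPIC_DISABLE_THREADING=1 python fbpic_script.py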