From 9ae48f399068c53e3fef8e1f76c4b534edc25ec4 Mon Sep 17 00:00:00 2001
From: Manuel Kirchen <manuel.kirchen@desy.de>
Date: Fri, 14 Jul 2017 18:51:03 +0200
Subject: [PATCH 01/36] Initial CPU multi-threading implementation

---
 fbpic/main.py                                 |   51 +-
 fbpic/particles/cuda_deposition/__init__.py   |    0
 fbpic/particles/cuda_deposition/cubic.py      | 1214 -----------------
 fbpic/particles/cuda_deposition/linear.py     |  482 -------
 .../cuda_deposition/linear_non_atomic.py      |  671 ---------
 fbpic/particles/cuda_methods.py               |  997 --------------
 fbpic/particles/numba_methods.py              |  251 ----
 fbpic/particles/particles.py                  |  285 +++-
 fbpic/particles/utility_methods.py            |  155 ---
 9 files changed, 247 insertions(+), 3859 deletions(-)
 delete mode 100644 fbpic/particles/cuda_deposition/__init__.py
 delete mode 100644 fbpic/particles/cuda_deposition/cubic.py
 delete mode 100644 fbpic/particles/cuda_deposition/linear.py
 delete mode 100644 fbpic/particles/cuda_deposition/linear_non_atomic.py
 delete mode 100644 fbpic/particles/cuda_methods.py
 delete mode 100644 fbpic/particles/numba_methods.py
 delete mode 100644 fbpic/particles/utility_methods.py

diff --git a/fbpic/main.py b/fbpic/main.py
index f508b0f69..f33554966 100644
--- a/fbpic/main.py
+++ b/fbpic/main.py
@@ -10,6 +10,7 @@
 # (This needs to be done before the other imports,
 # as it sets the cuda context)
 from mpi4py import MPI
+import numba
 # Check if CUDA is available, then import CUDA functions
 from .cuda_utils import cuda_installed
 if cuda_installed:
@@ -41,12 +42,11 @@ class Simulation(object):
     def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax,
                  p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0.,
                  n_order=-1, dens_func=None, filter_currents=True,
-                 v_comoving=None, use_galilean=False,
-                 initialize_ions=False, use_cuda=False,
-                 n_guard=None, n_damp=30,
-                 exchange_period=None, boundaries='periodic',
-                 gamma_boost=None, use_all_mpi_ranks=True,
-                 particle_shape='linear' ):
+                 v_comoving=None, use_galilean=False, initialize_ions=False,
+                 use_cuda=False, use_threading=True, nthreads=None,
+                 n_guard=None, n_damp=30, exchange_period=None, 
+                 boundaries='periodic', gamma_boost=None, 
+                 use_all_mpi_ranks=True, particle_shape='linear' ):
         """
         Initializes a simulation, by creating the following structures:
 
@@ -132,6 +132,12 @@ def dens_func( z, r ) ...
 
         use_cuda: bool, optional
             Wether to use CUDA (GPU) acceleration
+        use_threading : bool, optional
+            Whether to use multi-threading on the CPU.
+        nthreads: int, optional
+            Number of CPU threads to use (if use_threading is set).
+            If nthreads is None, the number of threads is determined
+            automatically from numba's preset value.
 
         n_guard: int, optional
             Number of guard cells to use at the left and right of
@@ -186,13 +192,23 @@ def dens_func( z, r ) ...
             to first order shapes, 'linear_non_atomic' uses an equivalent
             deposition scheme to 'linear' which avoids atomics on the GPU.
         """
-        # Check whether to use cuda
+        # Check whether to use CUDA
         self.use_cuda = use_cuda
         if (use_cuda==True) and (cuda_installed==False):
             print('*** Cuda not available for the simulation.')
             print('*** Performing the simulation on CPU.')
             self.use_cuda = False
-
+        # CPU multi-threading
+        self.use_threading = use_threading
+        if self.use_threading:
+            # Define number of threads used
+            if nthreads is not None:
+                # Set user-defined number of threads
+                self.nthreads = nthreads
+                numba.config.NUMBA_NUM_THREADS = self.nthreads
+            else:
+                # Automatically take numba preset for number of threads
+                self.nthreads = numba.config.NUMBA_NUM_THREADS
         # Register the comoving parameters
         self.v_comoving = v_comoving
         self.use_galilean = use_galilean
@@ -234,19 +250,20 @@ def dens_func( z, r ) ...
         # Initialize the electrons and the ions
         grid_shape = self.fld.interp[0].Ez.shape
         self.ptcl = [
-            Particles( q=-e, m=m_e, n=n_e, Npz=Npz, zmin=p_zmin,
-                       zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax,
-                       Nptheta=p_nt, dt=dt, dens_func=dens_func,
-                       use_cuda=self.use_cuda, uz_m=uz_m,
-                       grid_shape=grid_shape, particle_shape=particle_shape) ]
+            Particles(q=-e, m=m_e, n=n_e, Npz=Npz, zmin=p_zmin,
+                      zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax,
+                      Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m,
+                      grid_shape=grid_shape, particle_shape=particle_shape,
+                      use_cuda=self.use_cuda,
+                      use_threading=self.use_threading) ]
         if initialize_ions :
             self.ptcl.append(
                 Particles(q=e, m=m_p, n=n_e, Npz=Npz, zmin=p_zmin,
                           zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax,
-                          Nptheta=p_nt, dt=dt, dens_func=dens_func,
-                          use_cuda=self.use_cuda, uz_m=uz_m,
-                          grid_shape=grid_shape,
-                          particle_shape=particle_shape ) )
+                          Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m,
+                          grid_shape=grid_shape, particle_shape=particle_shape,
+                          use_cuda=self.use_cuda,
+                          use_threading=self.use_threading) )
 
         # Register the number of particles per cell along z, and dt
         # (Necessary for the moving window)
diff --git a/fbpic/particles/cuda_deposition/__init__.py b/fbpic/particles/cuda_deposition/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/fbpic/particles/cuda_deposition/cubic.py b/fbpic/particles/cuda_deposition/cubic.py
deleted file mode 100644
index 45e15e0eb..000000000
--- a/fbpic/particles/cuda_deposition/cubic.py
+++ /dev/null
@@ -1,1214 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the deposition methods for rho and J for cubic order shapes
-"""
-from numba import cuda, int64
-import math
-from scipy.constants import c
-import numpy as np
-
-# Shape Factor helper functions to compute particle shapes.
-
-
-@cuda.jit(device=True, inline=True)
-def z_shape(cell_position, index):
-    iz = int64(math.floor(cell_position)) - 1
-    if index == 0:
-        return (-1./6.)*((cell_position-iz)-2)**3
-    if index == 1:
-        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)
-    if index == 2:
-        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)
-    if index == 3:
-        return (-1./6.)*(((iz+3)-cell_position)-2)**3
-
-
-@cuda.jit(device=True, inline=True)
-def r_shape(cell_position, index):
-    flip_factor = 1.
-    ir = int64(math.floor(cell_position)) - 1
-    if index == 0:
-        if ir < 0:
-            flip_factor = -1.
-        return flip_factor*(-1./6.)*((cell_position-ir)-2)**3
-    if index == 1:
-        if ir+1 < 0:
-            flip_factor = -1.
-        return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4)
-    if index == 2:
-        if ir+2 < 0:
-            flip_factor = -1.
-        return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4)
-    if index == 3:
-        if ir+3 < 0:
-            flip_factor = -1.
-        return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3
-
-# -------------------------------
-# Field deposition utility - rho
-# -------------------------------
-
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-                float64, float64, int32, \
-                float64, float64, int32, \
-                complex128[:,:], complex128[:,:], \
-                int32[:], int32[:])')
-def deposit_rho_gpu_cubic(x, y, z, w,
-                          invdz, zmin, Nz,
-                          invdr, rmin, Nr,
-                          rho_m0, rho_m1,
-                          cell_idx, prefix_sum):
-    """
-    Deposition of the charge density rho using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of rho that is deposited to the
-    16 cells surounding the particle based on its shape (cubic).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 16 variables (one for each surrounding cell) to
-    maintain parallelism while avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho_m0, rho_m1 : 2darrays of complexs
-        The charge density on the interpolation grid for
-        mode 0 and 1. (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the considered direction
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    if i < prefix_sum.shape[0]:
-        # Calculate the cell index in 2D from the 1D threadIdx
-        iz_cell = int(i / Nr)
-        ir_cell = int(i - iz_cell * Nr)
-        # Calculate the inclusive offset for the current cell
-        # It represents the number of particles contained in all other cells
-        # with an index smaller than i + the total number of particles in the
-        # current cell (inclusive).
-        incl_offset = np.int32(prefix_sum[i])
-        # Calculate the frequency per cell from the offset and the previous
-        # offset (prefix_sum[i-1]).
-        if i > 0:
-            frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1])
-        if i == 0:
-            frequency_per_cell = np.int32(incl_offset)
-
-        # Declare local field arrays
-        R_m0_00 = 0.
-        R_m1_00 = 0. + 0.j
-
-        R_m0_01 = 0.
-        R_m1_01 = 0. + 0.j
-
-        R_m0_02 = 0.
-        R_m1_02 = 0. + 0.j
-
-        R_m0_03 = 0.
-        R_m1_03 = 0. + 0.j
-
-        R_m0_10 = 0.
-        R_m1_10 = 0. + 0.j
-
-        R_m0_11 = 0.
-        R_m1_11 = 0. + 0.j
-
-        R_m0_12 = 0.
-        R_m1_12 = 0. + 0.j
-
-        R_m0_13 = 0.
-        R_m1_13 = 0. + 0.j
-
-        R_m0_20 = 0.
-        R_m1_20 = 0. + 0.j
-
-        R_m0_21 = 0.
-        R_m1_21 = 0. + 0.j
-
-        R_m0_22 = 0.
-        R_m1_22 = 0. + 0.j
-
-        R_m0_23 = 0.
-        R_m1_23 = 0. + 0.j
-
-        R_m0_30 = 0.
-        R_m1_30 = 0. + 0.j
-
-        R_m0_31 = 0.
-        R_m1_31 = 0. + 0.j
-
-        R_m0_32 = 0.
-        R_m1_32 = 0. + 0.j
-
-        R_m0_33 = 0.
-        R_m1_33 = 0. + 0.j
-
-        for j in range(frequency_per_cell):
-            # Get the particle index before the sorting
-            # --------------------------------------------
-            # (Since incl_offset is a cumulative sum of particle number,
-            # and since python index starts at 0, one has to add -1)
-            ptcl_idx = incl_offset-1-j
-
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1./rj
-                cos = xj*invr  # Cosine
-                sin = yj*invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j*sin
-
-            # Positions of the particles, in the cell unit
-            r_cell = invdr*(rj - rmin) - 0.5
-            z_cell = invdz*(zj - zmin) - 0.5
-
-            # Calculate rho
-            # --------------------------------------------
-            # Mode 0
-            R_m0_scal = wj * exptheta_m0
-            # Mode 1
-            R_m1_scal = wj * exptheta_m1
-            # Compute values in local copies and consider boundaries
-            ir0 = int64(math.floor(r_cell)) - 1
-
-            if (ir0 == -2):
-                R_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal
-
-            if (ir0 == -1):
-                R_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal
-            if (ir0 >= 0):
-                R_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*R_m1_scal
-
-                R_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m0_scal
-                R_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*R_m1_scal
-                R_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m0_scal
-                R_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*R_m1_scal
-                R_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m0_scal
-                R_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*R_m1_scal
-                R_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m0_scal
-                R_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*R_m1_scal
-
-        # Index Shifting since local copies are centered around
-        # the current cell
-        srl = 0         # shift r lower
-        sru = 0         # shift r upper inner
-        sru2 = 0        # shift r upper outer
-        szl = 0         # shift z lower
-        szu = 0         # shift z upper inner
-        szu2 = 0        # shift z upper outer
-        if (iz_cell-1) < 0:
-            szl += Nz
-        if (iz_cell) == (Nz - 1):
-            szu -= Nz
-            szu2 -= Nz
-        if (iz_cell+1) == (Nz - 1):
-            szu2 -= Nz
-        if (ir_cell) >= (Nr - 1):
-            sru = -1
-            sru2 = -2
-        if (ir_cell+1) == (Nr - 1):
-            sru2 = -1
-        if (ir_cell-1) < 0:
-            srl = 1
-
-        # Atomically add the registers to global memory
-        if frequency_per_cell > 0:
-
-            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m0_00.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell - 1 + srl), R_m0_01.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell - 1 + srl), R_m1_01.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell - 1 + srl), R_m1_01.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m0_02.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m0_03.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell ), R_m0_10.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell ), R_m1_10.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell ), R_m1_10.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_11.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_11.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_11.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell), R_m0_12.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell), R_m1_12.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell), R_m1_12.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell), R_m0_13.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell), R_m1_13.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell), R_m1_13.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m0_20.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 1 + sru), R_m0_21.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 1 + sru), R_m1_21.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 1 + sru), R_m1_21.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m0_22.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m0_23.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m0_30.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 2 + sru2), R_m0_31.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 2 + sru2), R_m1_31.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 2 + sru2), R_m1_31.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m0_32.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m0_33.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.imag)
-
-
-# -------------------------------
-# Field deposition utility - J
-# -------------------------------
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-                float64[:], float64[:], float64[:], float64[:], \
-                float64, float64, int32, \
-                float64, float64, int32, \
-                complex128[:,:], complex128[:,:], \
-                complex128[:,:], complex128[:,:], \
-                complex128[:,:], complex128[:,:],\
-                int32[:], int32[:])')
-def deposit_J_gpu_cubic(x, y, z, w,
-                        ux, uy, uz, inv_gamma,
-                        invdz, zmin, Nz,
-                        invdr, rmin, Nr,
-                        j_r_m0, j_r_m1,
-                        j_t_m0, j_t_m1,
-                        j_z_m0, j_z_m1,
-                        cell_idx, prefix_sum):
-    """
-    Deposition of the current J using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of J that is deposited to the
-    16 cells surounding the particle based on its shape (cubic).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 16 variables (one for each cell) to maintain
-    parallelism while avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    ux, uy, uz : 1darray of floats (in meters * second^-1)
-        The velocity of the particles
-
-    inv_gamma : 1darray of floats
-        The inverse of the relativistic gamma factor
-
-    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs
-        The current component in each direction (r, t, z)
-        on the interpolation grid for mode 0 and 1.
-        (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    if i < prefix_sum.shape[0]:
-        # Calculate the cell index in 2D from the 1D threadIdx
-        iz_cell = int(i/Nr)
-        ir_cell = int(i - iz_cell * Nr)
-        # Calculate the inclusive offset for the current cell
-        # It represents the number of particles contained in all other cells
-        # with an index smaller than i + the total number of particles in the
-        # current cell (inclusive).
-        incl_offset = np.int32(prefix_sum[i])
-        # Calculate the frequency per cell from the offset and the previous
-        # offset (prefix_sum[i-1]).
-        if i > 0:
-            frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1])
-        if i == 0:
-            frequency_per_cell = np.int32(incl_offset)
-
-        # Declare the local field value for
-        # all possible deposition directions,
-        # depending on the shape order and per mode for r,t and z.
-        J_r_m0_00 = 0.
-        J_t_m0_00 = 0.
-        J_z_m0_00 = 0.
-        J_r_m1_00 = 0. + 0.j
-        J_t_m1_00 = 0. + 0.j
-        J_z_m1_00 = 0. + 0.j
-
-        J_r_m0_01 = 0.
-        J_t_m0_01 = 0.
-        J_z_m0_01 = 0.
-        J_r_m1_01 = 0. + 0.j
-        J_t_m1_01 = 0. + 0.j
-        J_z_m1_01 = 0. + 0.j
-
-        J_r_m0_02 = 0.
-        J_t_m0_02 = 0.
-        J_z_m0_02 = 0.
-        J_r_m1_02 = 0. + 0.j
-        J_t_m1_02 = 0. + 0.j
-        J_z_m1_02 = 0. + 0.j
-
-        J_r_m0_03 = 0.
-        J_t_m0_03 = 0.
-        J_z_m0_03 = 0.
-        J_r_m1_03 = 0. + 0.j
-        J_t_m1_03 = 0. + 0.j
-        J_z_m1_03 = 0. + 0.j
-
-        J_r_m0_10 = 0.
-        J_t_m0_10 = 0.
-        J_z_m0_10 = 0.
-        J_r_m1_10 = 0. + 0.j
-        J_t_m1_10 = 0. + 0.j
-        J_z_m1_10 = 0. + 0.j
-
-        J_r_m0_11 = 0.
-        J_t_m0_11 = 0.
-        J_z_m0_11 = 0.
-        J_r_m1_11 = 0. + 0.j
-        J_t_m1_11 = 0. + 0.j
-        J_z_m1_11 = 0. + 0.j
-
-        J_r_m0_12 = 0.
-        J_t_m0_12 = 0.
-        J_z_m0_12 = 0.
-        J_r_m1_12 = 0. + 0.j
-        J_t_m1_12 = 0. + 0.j
-        J_z_m1_12 = 0. + 0.j
-
-        J_r_m0_13 = 0.
-        J_t_m0_13 = 0.
-        J_z_m0_13 = 0.
-        J_r_m1_13 = 0. + 0.j
-        J_t_m1_13 = 0. + 0.j
-        J_z_m1_13 = 0. + 0.j
-
-        J_r_m0_20 = 0.
-        J_t_m0_20 = 0.
-        J_z_m0_20 = 0.
-        J_r_m1_20 = 0. + 0.j
-        J_t_m1_20 = 0. + 0.j
-        J_z_m1_20 = 0. + 0.j
-
-        J_r_m0_21 = 0.
-        J_t_m0_21 = 0.
-        J_z_m0_21 = 0.
-        J_r_m1_21 = 0. + 0.j
-        J_t_m1_21 = 0. + 0.j
-        J_z_m1_21 = 0. + 0.j
-
-        J_r_m0_22 = 0.
-        J_t_m0_22 = 0.
-        J_z_m0_22 = 0.
-        J_r_m1_22 = 0. + 0.j
-        J_t_m1_22 = 0. + 0.j
-        J_z_m1_22 = 0. + 0.j
-
-        J_r_m0_23 = 0.
-        J_t_m0_23 = 0.
-        J_z_m0_23 = 0.
-        J_r_m1_23 = 0. + 0.j
-        J_t_m1_23 = 0. + 0.j
-        J_z_m1_23 = 0. + 0.j
-
-        J_r_m0_30 = 0.
-        J_t_m0_30 = 0.
-        J_z_m0_30 = 0.
-        J_r_m1_30 = 0. + 0.j
-        J_t_m1_30 = 0. + 0.j
-        J_z_m1_30 = 0. + 0.j
-
-        J_r_m0_31 = 0.
-        J_t_m0_31 = 0.
-        J_z_m0_31 = 0.
-        J_r_m1_31 = 0. + 0.j
-        J_t_m1_31 = 0. + 0.j
-        J_z_m1_31 = 0. + 0.j
-
-        J_r_m0_32 = 0.
-        J_t_m0_32 = 0.
-        J_z_m0_32 = 0.
-        J_r_m1_32 = 0. + 0.j
-        J_t_m1_32 = 0. + 0.j
-        J_z_m1_32 = 0. + 0.j
-
-        J_r_m0_33 = 0.
-        J_t_m0_33 = 0.
-        J_z_m0_33 = 0.
-        J_r_m1_33 = 0. + 0.j
-        J_t_m1_33 = 0. + 0.j
-        J_z_m1_33 = 0. + 0.j
-
-        # Loop over the number of particles per cell
-        for j in range(frequency_per_cell):
-            # Get the particle index
-            # ----------------------
-            # (Since incl_offset is a cumulative sum of particle number,
-            # and since python index starts at 0, one has to add -1)
-            ptcl_idx = incl_offset-1-j
-
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Velocity
-            uxj = ux[ptcl_idx]
-            uyj = uy[ptcl_idx]
-            uzj = uz[ptcl_idx]
-            # Inverse gamma
-            inv_gammaj = inv_gamma[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1./rj
-                cos = xj*invr  # Cosine
-                sin = yj*invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j*sin
-
-            # Get weights for the deposition
-            # --------------------------------------------
-            # Positions of the particles, in the cell unit
-            r_cell = invdr*(rj - rmin) - 0.5
-            z_cell = invdz*(zj - zmin) - 0.5
-
-            # Calculate the currents
-            # --------------------------------------------
-            # Mode 0
-            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
-            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
-            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
-            # Mode 1
-            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
-            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
-            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
-
-            # Compute values in local copies and consider boundaries
-            ir0 = int64(math.floor(r_cell)) - 1
-
-            if (ir0 == -2):
-                J_r_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal
-                J_r_m0_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal
-
-                J_t_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_z_m1_20 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_21 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_22 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_23 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_20 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_21 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_22 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_23 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal
-            if (ir0 == -1):
-                J_r_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_t_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_z_m0_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal
-            if (ir0 >= 0):
-                J_r_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m0_scal
-                J_r_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_r_m1_scal
-                J_r_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m0_scal
-                J_r_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_r_m1_scal
-                J_r_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m0_scal
-                J_r_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_r_m1_scal
-                J_r_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m0_scal
-                J_r_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_r_m1_scal
-
-                J_t_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m0_scal
-                J_t_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_t_m1_scal
-                J_t_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m0_scal
-                J_t_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_t_m1_scal
-                J_t_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m0_scal
-                J_t_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_t_m1_scal
-                J_t_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m0_scal
-                J_t_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_t_m1_scal
-
-                J_z_m0_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_00 += r_shape(r_cell, 0)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_01 += r_shape(r_cell, 0)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_02 += r_shape(r_cell, 0)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_03 += r_shape(r_cell, 0)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape(r_cell, 1)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape(r_cell, 1)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape(r_cell, 1)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape(r_cell, 1)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_20 += r_shape(r_cell, 2)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_21 += r_shape(r_cell, 2)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_22 += r_shape(r_cell, 2)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_23 += r_shape(r_cell, 2)*z_shape(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m0_scal
-                J_z_m1_30 += r_shape(r_cell, 3)*z_shape(z_cell, 0)*J_z_m1_scal
-                J_z_m0_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m0_scal
-                J_z_m1_31 += r_shape(r_cell, 3)*z_shape(z_cell, 1)*J_z_m1_scal
-                J_z_m0_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m0_scal
-                J_z_m1_32 += r_shape(r_cell, 3)*z_shape(z_cell, 2)*J_z_m1_scal
-                J_z_m0_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m0_scal
-                J_z_m1_33 += r_shape(r_cell, 3)*z_shape(z_cell, 3)*J_z_m1_scal
-
-        # Index Shifting since local copies are centered around
-        # the current cell
-        srl = 0         # shift r lower
-        sru = 0         # shift r upper inner
-        sru2 = 0        # shift r upper outer
-        szl = 0         # shift z lower
-        szu = 0         # shift z upper inner
-        szu2 = 0        # shift z upper outer
-        if (iz_cell-1) < 0:
-            szl += Nz
-        if (iz_cell) == (Nz - 1):
-            szu -= Nz
-            szu2 -= Nz
-        if (iz_cell+1) == (Nz - 1):
-            szu2 -= Nz
-        if (ir_cell) >= (Nr - 1):
-            sru = -1
-            sru2 = -2
-        if (ir_cell+1) == (Nr - 1):
-            sru2 = -1
-        if (ir_cell-1) < 0:
-            srl = 1
-
-        # Atomically add the registers to global memory
-        if frequency_per_cell > 0:
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m0_00)
-            cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell - 1 + srl), J_r_m0_01)
-            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m0_02)
-            cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m0_03)
-            cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell ), J_r_m0_10)
-            cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_11)
-            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_11.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_11.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell), J_r_m0_12)
-            cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell), J_r_m0_13)
-            cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m0_20)
-            cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 1 + sru), J_r_m0_21)
-            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m0_22)
-            cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m0_23)
-            cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m0_30)
-            cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 2 + sru2), J_r_m0_31)
-            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m0_32)
-            cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m0_33)
-            cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m0_00)
-            cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell - 1 + srl), J_t_m0_01)
-            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m0_02)
-            cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m0_03)
-            cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell ), J_t_m0_10)
-            cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_11)
-            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_11.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_11.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell), J_t_m0_12)
-            cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell), J_t_m0_13)
-            cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m0_20)
-            cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 1 + sru), J_t_m0_21)
-            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m0_22)
-            cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m0_23)
-            cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m0_30)
-            cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 2 + sru2), J_t_m0_31)
-            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m0_32)
-            cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m0_33)
-            cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.imag)
-
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m0_00)
-            cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell - 1 + srl), J_z_m0_01)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m0_02)
-            cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m0_03)
-            cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell ), J_z_m0_10)
-            cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_11)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_11.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_11.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell), J_z_m0_12)
-            cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell), J_z_m0_13)
-            cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m0_20)
-            cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 1 + sru), J_z_m0_21)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m0_22)
-            cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m0_23)
-            cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m0_30)
-            cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 2 + sru2), J_z_m0_31)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m0_32)
-            cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m0_33)
-            cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.imag)
diff --git a/fbpic/particles/cuda_deposition/linear.py b/fbpic/particles/cuda_deposition/linear.py
deleted file mode 100644
index 48adff845..000000000
--- a/fbpic/particles/cuda_deposition/linear.py
+++ /dev/null
@@ -1,482 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the deposition methods for rho and J for linear order shapes
-"""
-from numba import cuda, int64
-import math
-from scipy.constants import c
-import numpy as np
-
-
-# Shape Factor functions to compute particle shapes.
-@cuda.jit(device=True, inline=True)
-def get_z_shape_linear(cell_position, index):
-    iz = int64(math.floor(cell_position))
-    if index == 0:
-        return iz+1.-cell_position
-    if index == 1:
-        return cell_position - iz
-
-
-@cuda.jit(device=True, inline=True)
-def get_r_shape_linear(cell_position, index):
-    flip_factor = 1.
-    ir = int64(math.floor(cell_position))
-    if index == 0:
-        if ir < 0:
-            flip_factor = -1.
-        return flip_factor*(ir+1.-cell_position)
-    if index == 1:
-        return flip_factor*(cell_position - ir)
-
-
-# -------------------------------
-# Field deposition utility - rho
-# -------------------------------
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-                float64, float64, int32, \
-                float64, float64, int32, \
-                complex128[:,:], complex128[:,:], \
-                int32[:], int32[:])')
-def deposit_rho_gpu_linear(x, y, z, w,
-                           invdz, zmin, Nz,
-                           invdr, rmin, Nr,
-                           rho_m0, rho_m1,
-                           cell_idx, prefix_sum):
-    """
-    Deposition of the charge density rho using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of rho that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 4 variables (one for each possible direction,
-    e.g. upper in z, lower in r) to maintain parallelism while
-    avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho_m0, rho_m1 : 2darrays of complexs
-        The charge density on the interpolation grid for
-        mode 0 and 1. (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the considered direction
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    if i < prefix_sum.shape[0]:
-        # Calculate the cell index in 2D from the 1D threadIdx
-        iz_cell = int(i / Nr)
-        ir_cell = int(i - iz_cell * Nr)
-        # Calculate the inclusive offset for the current cell
-        # It represents the number of particles contained in all other cells
-        # with an index smaller than i + the total number of particles in the
-        # current cell (inclusive).
-        incl_offset = np.int32(prefix_sum[i])
-        # Calculate the frequency per cell from the offset and the previous
-        # offset (prefix_sum[i-1]).
-        if i > 0:
-            frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1])
-        if i == 0:
-            frequency_per_cell = np.int32(incl_offset)
-
-        # Declare local field arrays
-        R_m0_00 = 0.
-        R_m0_01 = 0.
-        R_m0_10 = 0.
-        R_m0_11 = 0.
-
-        R_m1_00 = 0. + 0.j
-        R_m1_01 = 0. + 0.j
-        R_m1_10 = 0. + 0.j
-        R_m1_11 = 0. + 0.j
-
-        for j in range(frequency_per_cell):
-            # Get the particle index before the sorting
-            # --------------------------------------------
-            # (Since incl_offset is a cumulative sum of particle number,
-            # and since python index starts at 0, one has to add -1)
-            ptcl_idx = incl_offset-1-j
-
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1./rj
-                cos = xj*invr  # Cosine
-                sin = yj*invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j*sin
-
-            # Positions of the particles, in the cell unit
-            r_cell = invdr*(rj - rmin) - 0.5
-            z_cell = invdz*(zj - zmin) - 0.5
-
-            # Calculate rho
-            # --------------------------------------------
-            # Mode 0
-            R_m0_scal = wj * exptheta_m0
-            # Mode 1
-            R_m1_scal = wj * exptheta_m1
-
-            # Boundary Region Shifts
-            ir_lower = int64(math.floor(r_cell))
-
-            R_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * R_m0_scal
-            R_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * R_m0_scal
-            R_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * R_m1_scal
-            R_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * R_m1_scal
-
-            if ir_lower == -1:
-                R_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m0_scal
-                R_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m0_scal
-                R_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m1_scal
-                R_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m1_scal
-            else:
-                R_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m0_scal
-                R_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m0_scal
-                R_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * R_m1_scal
-                R_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * R_m1_scal
-
-        # Cell shifts for the simulation boundaries
-        shift_r = 0
-        shift_z = 0
-        if ir_cell+1 > (Nr-1):
-            shift_r = -1
-        if iz_cell+1 > Nz-1:
-            shift_z -= Nz
-
-        # Atomically add the registers to global memory
-        if frequency_per_cell > 0:
-
-            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_00.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_00.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_00.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell), R_m0_01.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell), R_m1_01.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell), R_m1_01.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell+1 + shift_r), R_m0_10.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell+1 + shift_r), R_m1_10.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell+1 + shift_r), R_m1_10.imag)
-
-            cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m0_11.real)
-            cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.real)
-            cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.imag)
-
-
-# -------------------------------
-# Field deposition utility - J
-# -------------------------------
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-                float64[:], float64[:], float64[:], float64[:], \
-                float64, float64, int32, \
-                float64, float64, int32, \
-                complex128[:,:], complex128[:,:], \
-                complex128[:,:], complex128[:,:], \
-                complex128[:,:], complex128[:,:],\
-                int32[:], int32[:])')
-def deposit_J_gpu_linear(x, y, z, w,
-                         ux, uy, uz, inv_gamma,
-                         invdz, zmin, Nz,
-                         invdr, rmin, Nr,
-                         j_r_m0, j_r_m1,
-                         j_t_m0, j_t_m1,
-                         j_z_m0, j_z_m1,
-                         cell_idx, prefix_sum):
-    """
-    Deposition of the current J using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of J that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 4 variables (one for each possible direction,
-    e.g. upper in z, lower in r) to maintain parallelism while
-    avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    ux, uy, uz : 1darray of floats (in meters * second^-1)
-        The velocity of the particles
-
-    inv_gamma : 1darray of floats
-        The inverse of the relativistic gamma factor
-
-    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs
-        The current component in each direction (r, t, z)
-        on the interpolation grid for mode 0 and 1.
-        (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    if i < prefix_sum.shape[0]:
-        # Calculate the cell index in 2D from the 1D threadIdx
-        iz_cell = int(i/Nr)
-        ir_cell = int(i - iz_cell * Nr)
-        # Calculate the inclusive offset for the current cell
-        # It represents the number of particles contained in all other cells
-        # with an index smaller than i + the total number of particles in the
-        # current cell (inclusive).
-        incl_offset = np.int32(prefix_sum[i])
-        # Calculate the frequency per cell from the offset and the previous
-        # offset (prefix_sum[i-1]).
-        if i > 0:
-            frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1])
-        if i == 0:
-            frequency_per_cell = np.int32(incl_offset)
-
-        # Declare the local field value for
-        # all possible deposition directions,
-        # depending on the shape order and per mode for r,t and z.
-
-        J_r_m0_00 = 0.
-        J_r_m1_00 = 0. + 0.j
-        J_t_m0_00 = 0.# + 0.j
-        J_t_m1_00 = 0. + 0.j
-        J_z_m0_00 = 0.
-        J_z_m1_00 = 0. + 0.j
-
-        J_r_m0_01 = 0.
-        J_r_m1_01 = 0. + 0.j
-        J_t_m0_01 = 0.
-        J_t_m1_01 = 0. + 0.j
-        J_z_m0_01 = 0.
-        J_z_m1_01 = 0. + 0.j
-
-        J_r_m0_10 = 0.
-        J_r_m1_10 = 0. + 0.j
-        J_t_m0_10 = 0.
-        J_t_m1_10 = 0. + 0.j
-        J_z_m0_10 = 0.
-        J_z_m1_10 = 0. + 0.j
-
-        J_r_m0_11 = 0.
-        J_r_m1_11 = 0. + 0.j
-        J_t_m0_11 = 0.
-        J_t_m1_11 = 0. + 0.j
-        J_z_m0_11 = 0.
-        J_z_m1_11 = 0. + 0.j
-
-
-        # Loop over the number of particles per cell
-        for j in range(frequency_per_cell):
-            # Get the particle index
-            # ----------------------
-            # (Since incl_offset is a cumulative sum of particle number,
-            # and since python index starts at 0, one has to add -1)
-            ptcl_idx = incl_offset-1-j
-
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Velocity
-            uxj = ux[ptcl_idx]
-            uyj = uy[ptcl_idx]
-            uzj = uz[ptcl_idx]
-            # Inverse gamma
-            inv_gammaj = inv_gamma[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1./rj
-                cos = xj*invr  # Cosine
-                sin = yj*invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j*sin
-
-            # Get weights for the deposition
-            # --------------------------------------------
-            # Positions of the particles, in the cell unit
-            r_cell = invdr*(rj - rmin) - 0.5
-            z_cell = invdz*(zj - zmin) - 0.5
-
-            # Calculate the currents
-            # --------------------------------------------
-            # Mode 0
-            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
-            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
-            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
-            # Mode 1
-            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
-            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
-            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
-
-            # Deposit on local copies at respective position
-            ir_lower = int64(math.floor(r_cell))
-
-            J_r_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal
-            J_t_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal
-            J_z_m0_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal
-            J_r_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal
-            J_t_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal
-            J_z_m0_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal
-            J_r_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal
-            J_t_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal
-            J_z_m1_00 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal
-            J_r_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal
-            J_t_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal
-            J_z_m1_01 += get_r_shape_linear(r_cell, 0)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal
-
-            # Take into account lower r flips
-            if ir_lower == -1:
-                J_r_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal
-                J_t_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal
-                J_z_m0_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal
-                J_r_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal
-                J_t_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal
-                J_z_m0_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal
-                J_r_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal
-                J_t_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal
-                J_z_m1_00 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal
-                J_r_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal
-                J_t_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal
-                J_z_m1_01 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal
-            else:
-                J_r_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m0_scal
-                J_t_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m0_scal
-                J_z_m0_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m0_scal
-                J_r_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m0_scal
-                J_t_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m0_scal
-                J_z_m0_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m0_scal
-                J_r_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_r_m1_scal
-                J_t_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_t_m1_scal
-                J_z_m1_10 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 0) * J_z_m1_scal
-                J_r_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_r_m1_scal
-                J_t_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_t_m1_scal
-                J_z_m1_11 += get_r_shape_linear(r_cell, 1)*get_z_shape_linear(z_cell, 1) * J_z_m1_scal
-
-        # Cell shifts for the simulation boundaries
-        shift_r = 0
-        shift_z = 0
-        if (ir_cell+1) > (Nr-1):
-            shift_r = -1
-        if (iz_cell+1) > Nz-1:
-            shift_z -= Nz
-
-        # Atomically add the registers to global memory
-        if frequency_per_cell > 0:
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_00.real)
-            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_00.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_00.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell), J_r_m0_01.real)
-            cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell+1 + shift_r), J_r_m0_10.real)
-            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.imag)
-
-            cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m0_11.real)
-            cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.real)
-            cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_00.real)
-            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_00.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_00.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell), J_t_m0_01.real)
-            cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell+1 + shift_r), J_t_m0_10.real)
-            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.imag)
-
-            cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m0_11.real)
-            cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.real)
-            cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_00.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_00.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_00.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell), J_z_m0_01.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell+1 + shift_r), J_z_m0_10.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.imag)
-
-            cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m0_11.real)
-            cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.real)
-            cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.imag)
diff --git a/fbpic/particles/cuda_deposition/linear_non_atomic.py b/fbpic/particles/cuda_deposition/linear_non_atomic.py
deleted file mode 100644
index 6dea16d21..000000000
--- a/fbpic/particles/cuda_deposition/linear_non_atomic.py
+++ /dev/null
@@ -1,671 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the deposition methods for rho and J for linear order shapes
-without using atomic operations.
-"""
-from numba import cuda
-import math
-from scipy.constants import c
-import numpy as np
-
-# -------------------------------
-# Field deposition utility - rho
-# -------------------------------
-
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-                float64, float64, int32, \
-                float64, float64, int32, \
-                complex128[:,:,:], complex128[:,:,:], \
-                complex128[:,:,:], complex128[:,:,:],\
-                int32[:], int32[:])')
-def deposit_rho_gpu(x, y, z, w,
-                    invdz, zmin, Nz,
-                    invdr, rmin, Nr,
-                    rho0, rho1,
-                    rho2, rho3,
-                    cell_idx, prefix_sum):
-    """
-    Deposition of the charge density rho using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of rho that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 4 arrays (one for each possible direction,
-    e.g. upper in z, lower in r) to maintain parallelism while
-    avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho0, rho1, rho2, rho3 : 3darray of complexs
-        2d field arrays, one for each of the deposition directions
-        The third dimension contains the two possible modes.
-        (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the considered direction
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    if i < prefix_sum.shape[0]:
-        # Calculate the cell index in 2D from the 1D threadIdx
-        iz = int(i / Nr)
-        ir = int(i - iz * Nr)
-        # Calculate the inclusive offset for the current cell
-        # It represents the number of particles contained in all other cells
-        # with an index smaller than i + the total number of particles in the
-        # current cell (inclusive).
-        incl_offset = np.int32(prefix_sum[i])
-        # Calculate the frequency per cell from the offset and the previous
-        # offset (prefix_sum[i-1]).
-        if i > 0:
-            frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1])
-        if i == 0:
-            frequency_per_cell = np.int32(incl_offset)
-        # Initialize the local field value for
-        # all four possible deposition directions
-        # Mode 0, 1 for r, t, z
-        # 1 : lower in r, lower in z
-        # 2 : lower in r, upper in z
-        # 3 : upper in r, lower in z
-        # 4 : upper in r, upper in z
-        R1_m0 = 0. + 0.j
-        R2_m0 = 0. + 0.j
-        R3_m0 = 0. + 0.j
-        R4_m0 = 0. + 0.j
-        # ------------
-        R1_m1 = 0. + 0.j
-        R2_m1 = 0. + 0.j
-        R3_m1 = 0. + 0.j
-        R4_m1 = 0. + 0.j
-        # Loop over the number of particles per cell
-        for j in range(frequency_per_cell):
-            # Get the particle index before the sorting
-            # --------------------------------------------
-            # (Since incl_offset is a cumulative sum of particle number,
-            # and since python index starts at 0, one has to add -1)
-            ptcl_idx = incl_offset - 1 - j
-
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1. / rj
-                cos = xj * invr  # Cosine
-                sin = yj * invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j * sin
-
-            # Get linear weights for the deposition
-            # --------------------------------------------
-            # Positions of the particles, in the cell unit
-            r_cell = invdr * (rj - rmin) - 0.5
-            z_cell = invdz * (zj - zmin) - 0.5
-            # Original index of the uppper and lower cell
-            ir_lower = int(math.floor(r_cell))
-            ir_upper = ir_lower + 1
-            iz_lower = int(math.floor(z_cell))
-            iz_upper = iz_lower + 1
-            # Linear weight
-            Sr_lower = ir_upper - r_cell
-            Sr_upper = r_cell - ir_lower
-            Sz_lower = iz_upper - z_cell
-            Sz_upper = z_cell - iz_lower
-            # Set guard weights to zero
-            Sr_guard = 0.
-
-            # Treat the boundary conditions
-            # --------------------------------------------
-            # guard cells in lower r
-            if ir_lower < 0:
-                Sr_guard = Sr_lower
-                Sr_lower = 0.
-                ir_lower = 0
-            # absorbing in upper r
-            if ir_lower > Nr - 1:
-                ir_lower = Nr - 1
-            if ir_upper > Nr - 1:
-                ir_upper = Nr - 1
-            # periodic boundaries in z
-            # lower z boundaries
-            if iz_lower < 0:
-                iz_lower += Nz
-            if iz_upper < 0:
-                iz_upper += Nz
-            # upper z boundaries
-            if iz_lower > Nz - 1:
-                iz_lower -= Nz
-            if iz_upper > Nz - 1:
-                iz_upper -= Nz
-
-            # Calculate rho
-            # --------------------------------------------
-            # Mode 0
-            R_m0 = wj * exptheta_m0
-            # Mode 1
-            R_m1 = wj * exptheta_m1
-
-            # Caculate the weighted currents for each
-            # of the four possible direction
-            # --------------------------------------------
-            if ir_lower == ir_upper:
-                # In the case that ir_lower and ir_upper are equal,
-                # the current is added only to the array corresponding
-                # to ir_lower.
-                # (This is the case for the boundaries in r)
-                R1_m0 += Sz_lower * Sr_lower * R_m0
-                R1_m0 += Sz_lower * Sr_upper * R_m0
-                R3_m0 += Sz_upper * Sr_lower * R_m0
-                R3_m0 += Sz_upper * Sr_upper * R_m0
-                # -----------------------------
-                R1_m1 += Sz_lower * Sr_lower * R_m1
-                R1_m1 += Sz_lower * Sr_upper * R_m1
-                R3_m1 += Sz_upper * Sr_lower * R_m1
-                R3_m1 += Sz_upper * Sr_upper * R_m1
-                # -----------------------------
-            if ir_lower != ir_upper:
-                # In the case that ir_lower and ir_upper are different,
-                # add the current to the four arrays according to
-                # the direction.
-                R1_m0 += Sz_lower * Sr_lower * R_m0
-                R2_m0 += Sz_lower * Sr_upper * R_m0
-                R3_m0 += Sz_upper * Sr_lower * R_m0
-                R4_m0 += Sz_upper * Sr_upper * R_m0
-                # -----------------------------
-                R1_m1 += Sz_lower * Sr_lower * R_m1
-                R2_m1 += Sz_lower * Sr_upper * R_m1
-                R3_m1 += Sz_upper * Sr_lower * R_m1
-                R4_m1 += Sz_upper * Sr_upper * R_m1
-                # -----------------------------
-            if ir_lower == ir_upper == 0:
-                # Treat the guard cells.
-                # Add the current to the guard cells
-                # for particles that had an original
-                # cell index < 0.
-                R1_m0 += -1. * Sz_lower * Sr_guard * R_m0
-                R3_m0 += -1. * Sz_upper * Sr_guard * R_m0
-                # ---------------------------------
-                R1_m1 += -1. * Sz_lower * Sr_guard * R_m1
-                R3_m1 += -1. * Sz_upper * Sr_guard * R_m1
-        # Write the calculated field values to
-        # the field arrays defined on the interpolation grid
-        rho0[iz, ir, 0] = R1_m0
-        rho0[iz, ir, 1] = R1_m1
-        rho1[iz, ir, 0] = R2_m0
-        rho1[iz, ir, 1] = R2_m1
-        rho2[iz, ir, 0] = R3_m0
-        rho2[iz, ir, 1] = R3_m1
-        rho3[iz, ir, 0] = R4_m0
-        rho3[iz, ir, 1] = R4_m1
-
-
-@cuda.jit('void(complex128[:,:], complex128[:,:], \
-                complex128[:,:,:], complex128[:,:,:], \
-                complex128[:,:,:], complex128[:,:,:])')
-def add_rho(rho_m0, rho_m1,
-            rho0, rho1,
-            rho2, rho3):
-    """
-    Merges the 4 separate field arrays that contain rho for
-    each deposition direction and adds them to the global
-    interpolation grid arrays for mode 0 and 1.
-
-    Parameters
-    ----------
-    rho_m0, rho_m1 : 2darrays of complexs
-        The charge density on the interpolation grid for
-        mode 0 and 1. (is modified by this function)
-
-    rho0, rho1, rho2, rho3 : 3darrays of complexs
-        2d field arrays, one for each of the deposition directions
-        The third dimension contains the two possible modes.
-    """
-    # Get the CUDA Grid in 2D
-    i, j = cuda.grid(2)
-    # Only for threads within (nz, nr)
-    if (i < rho_m0.shape[0] and j < rho_m0.shape[1]):
-        # Sum the four field arrays for the different deposition
-        # directions and write them to the global field array
-        rho_m0[i, j] += rho0[i, j, 0] + \
-            rho1[i, j - 1, 0] + \
-            rho2[i - 1, j, 0] + \
-            rho3[i - 1, j - 1, 0]
-
-        rho_m1[i, j] += rho0[i, j, 1] + \
-            rho1[i, j - 1, 1] + \
-            rho2[i - 1, j, 1] + \
-            rho3[i - 1, j - 1, 1]
-
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-                float64[:], float64[:], float64[:], float64[:], \
-                float64, float64, int32, \
-                float64, float64, int32, \
-                complex128[:,:,:], complex128[:,:,:], \
-                complex128[:,:,:], complex128[:,:,:],\
-                int32[:], int32[:])')
-def deposit_J_gpu(x, y, z, w,
-                  ux, uy, uz, inv_gamma,
-                  invdz, zmin, Nz,
-                  invdr, rmin, Nr,
-                  J0, J1,
-                  J2, J3,
-                  cell_idx, prefix_sum):
-    """
-    Deposition of the current J using numba on the GPU.
-    Iterates over the cells and over the particles per cell.
-    Calculates the weighted amount of J that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The particles are sorted by their cell index (the lower cell
-    in r and z that they deposit to) and the deposited field
-    is split into 4 arrays (one for each possible direction,
-    e.g. upper in z, lower in r) to maintain parallelism while
-    avoiding any race conditions.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    ux, uy, uz : 1darray of floats (in meters * second^-1)
-        The velocity of the particles
-
-    inv_gamma : 1darray of floats
-        The inverse of the relativistic gamma factor
-
-    J0, J1, J2, J3 : 3darray of complexs
-        2d field arrays, one for each of the deposition directions
-        The third dimension contains the two possible modes and the
-        3 directions of J in cylindrical coordinates (r, t, z).
-        (is mofidied by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    if i < prefix_sum.shape[0]:
-        # Calculate the cell index in 2D from the 1D threadIdx
-        iz = int(i / Nr)
-        ir = int(i - iz * Nr)
-        # Calculate the inclusive offset for the current cell
-        # It represents the number of particles contained in all other cells
-        # with an index smaller than i + the total number of particles in the
-        # current cell (inclusive).
-        incl_offset = np.int32(prefix_sum[i])
-        # Calculate the frequency per cell from the offset and the previous
-        # offset (prefix_sum[i-1]).
-        if i > 0:
-            frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1])
-        if i == 0:
-            frequency_per_cell = np.int32(incl_offset)
-        # Initialize the local field value for
-        # all four possible deposition directions
-        # Mode 0, 1 for r, t, z
-        # 1 : lower in r, lower in z
-        # 2 : lower in r, upper in z
-        # 3 : upper in r, lower in z
-        # 4 : upper in r, upper in z
-        Jr1_m0 = 0. + 0.j
-        Jr2_m0 = 0. + 0.j
-        Jr3_m0 = 0. + 0.j
-        Jr4_m0 = 0. + 0.j
-        # -------------
-        Jr1_m1 = 0. + 0.j
-        Jr2_m1 = 0. + 0.j
-        Jr3_m1 = 0. + 0.j
-        Jr4_m1 = 0. + 0.j
-        # -------------
-        Jt1_m0 = 0. + 0.j
-        Jt2_m0 = 0. + 0.j
-        Jt3_m0 = 0. + 0.j
-        Jt4_m0 = 0. + 0.j
-        # -------------
-        Jt1_m1 = 0. + 0.j
-        Jt2_m1 = 0. + 0.j
-        Jt3_m1 = 0. + 0.j
-        Jt4_m1 = 0. + 0.j
-        # -------------
-        Jz1_m0 = 0. + 0.j
-        Jz2_m0 = 0. + 0.j
-        Jz3_m0 = 0. + 0.j
-        Jz4_m0 = 0. + 0.j
-        # -------------
-        Jz1_m1 = 0. + 0.j
-        Jz2_m1 = 0. + 0.j
-        Jz3_m1 = 0. + 0.j
-        Jz4_m1 = 0. + 0.j
-        # Loop over the number of particles per cell
-        for j in range(frequency_per_cell):
-            # Get the particle index
-            # ----------------------
-            # (Since incl_offset is a cumulative sum of particle number,
-            # and since python index starts at 0, one has to add -1)
-            ptcl_idx = incl_offset - 1 - j
-
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Velocity
-            uxj = ux[ptcl_idx]
-            uyj = uy[ptcl_idx]
-            uzj = uz[ptcl_idx]
-            # Inverse gamma
-            inv_gammaj = inv_gamma[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1. / rj
-                cos = xj * invr  # Cosine
-                sin = yj * invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j * sin
-
-            # Get linear weights for the deposition
-            # --------------------------------------------
-            # Positions of the particles, in the cell unit
-            r_cell = invdr * (rj - rmin) - 0.5
-            z_cell = invdz * (zj - zmin) - 0.5
-            # Original index of the uppper and lower cell
-            # in r and z
-            ir_lower = int(math.floor(r_cell))
-            ir_upper = ir_lower + 1
-            iz_lower = int(math.floor(z_cell))
-            iz_upper = iz_lower + 1
-            # Linear weight
-            Sr_lower = ir_upper - r_cell
-            Sr_upper = r_cell - ir_lower
-            Sz_lower = iz_upper - z_cell
-            Sz_upper = z_cell - iz_lower
-            # Set guard weights to zero
-            Sr_guard = 0.
-
-            # Treat the boundary conditions
-            # --------------------------------------------
-            # guard cells in lower r
-            if ir_lower < 0:
-                Sr_guard = Sr_lower
-                Sr_lower = 0.
-                ir_lower = 0
-            # absorbing in upper r
-            if ir_lower > Nr - 1:
-                ir_lower = Nr - 1
-            if ir_upper > Nr - 1:
-                ir_upper = Nr - 1
-            # periodic boundaries in z
-            # lower z boundaries
-            if iz_lower < 0:
-                iz_lower += Nz
-            if iz_upper < 0:
-                iz_upper += Nz
-            # upper z boundaries
-            if iz_lower > Nz - 1:
-                iz_lower -= Nz
-            if iz_upper > Nz - 1:
-                iz_upper -= Nz
-
-            # Calculate the currents
-            # --------------------------------------------
-            # Mode 0
-            Jr_m0 = wj * c * inv_gammaj * (cos * uxj + sin * uyj) * exptheta_m0
-            Jt_m0 = wj * c * inv_gammaj * (cos * uyj - sin * uxj) * exptheta_m0
-            Jz_m0 = wj * c * inv_gammaj * uzj * exptheta_m0
-            # Mode 1
-            Jr_m1 = wj * c * inv_gammaj * (cos * uxj + sin * uyj) * exptheta_m1
-            Jt_m1 = wj * c * inv_gammaj * (cos * uyj - sin * uxj) * exptheta_m1
-            Jz_m1 = wj * c * inv_gammaj * uzj * exptheta_m1
-
-            # Caculate the weighted currents for each
-            # of the four possible direction
-            # --------------------------------------------
-            if ir_lower == ir_upper:
-                # In the case that ir_lower and ir_upper are equal,
-                # the current is added only to the array corresponding
-                # to ir_lower.
-                # (This is the case for the boundaries in r)
-                Jr1_m0 += Sz_lower * Sr_lower * Jr_m0
-                Jr1_m0 += Sz_lower * Sr_upper * Jr_m0
-                Jr3_m0 += Sz_upper * Sr_lower * Jr_m0
-                Jr3_m0 += Sz_upper * Sr_upper * Jr_m0
-                # -------------------------------
-                Jr1_m1 += Sz_lower * Sr_lower * Jr_m1
-                Jr1_m1 += Sz_lower * Sr_upper * Jr_m1
-                Jr3_m1 += Sz_upper * Sr_lower * Jr_m1
-                Jr3_m1 += Sz_upper * Sr_upper * Jr_m1
-                # -------------------------------
-                Jt1_m0 += Sz_lower * Sr_lower * Jt_m0
-                Jt1_m0 += Sz_lower * Sr_upper * Jt_m0
-                Jt3_m0 += Sz_upper * Sr_lower * Jt_m0
-                Jt3_m0 += Sz_upper * Sr_upper * Jt_m0
-                # -------------------------------
-                Jt1_m1 += Sz_lower * Sr_lower * Jt_m1
-                Jt1_m1 += Sz_lower * Sr_upper * Jt_m1
-                Jt3_m1 += Sz_upper * Sr_lower * Jt_m1
-                Jt3_m1 += Sz_upper * Sr_upper * Jt_m1
-                # -------------------------------
-                Jz1_m0 += Sz_lower * Sr_lower * Jz_m0
-                Jz1_m0 += Sz_lower * Sr_upper * Jz_m0
-                Jz3_m0 += Sz_upper * Sr_lower * Jz_m0
-                Jz3_m0 += Sz_upper * Sr_upper * Jz_m0
-                # -------------------------------
-                Jz1_m1 += Sz_lower * Sr_lower * Jz_m1
-                Jz1_m1 += Sz_lower * Sr_upper * Jz_m1
-                Jz3_m1 += Sz_upper * Sr_lower * Jz_m1
-                Jz3_m1 += Sz_upper * Sr_upper * Jz_m1
-                # -------------------------------
-            if ir_lower != ir_upper:
-                # In the case that ir_lower and ir_upper are different,
-                # add the current to the four arrays according to
-                # the direction.
-                Jr1_m0 += Sz_lower * Sr_lower * Jr_m0
-                Jr2_m0 += Sz_lower * Sr_upper * Jr_m0
-                Jr3_m0 += Sz_upper * Sr_lower * Jr_m0
-                Jr4_m0 += Sz_upper * Sr_upper * Jr_m0
-                # -------------------------------
-                Jr1_m1 += Sz_lower * Sr_lower * Jr_m1
-                Jr2_m1 += Sz_lower * Sr_upper * Jr_m1
-                Jr3_m1 += Sz_upper * Sr_lower * Jr_m1
-                Jr4_m1 += Sz_upper * Sr_upper * Jr_m1
-                # -------------------------------
-                Jt1_m0 += Sz_lower * Sr_lower * Jt_m0
-                Jt2_m0 += Sz_lower * Sr_upper * Jt_m0
-                Jt3_m0 += Sz_upper * Sr_lower * Jt_m0
-                Jt4_m0 += Sz_upper * Sr_upper * Jt_m0
-                # -------------------------------
-                Jt1_m1 += Sz_lower * Sr_lower * Jt_m1
-                Jt2_m1 += Sz_lower * Sr_upper * Jt_m1
-                Jt3_m1 += Sz_upper * Sr_lower * Jt_m1
-                Jt4_m1 += Sz_upper * Sr_upper * Jt_m1
-                # -------------------------------
-                Jz1_m0 += Sz_lower * Sr_lower * Jz_m0
-                Jz2_m0 += Sz_lower * Sr_upper * Jz_m0
-                Jz3_m0 += Sz_upper * Sr_lower * Jz_m0
-                Jz4_m0 += Sz_upper * Sr_upper * Jz_m0
-                # -------------------------------
-                Jz1_m1 += Sz_lower * Sr_lower * Jz_m1
-                Jz2_m1 += Sz_lower * Sr_upper * Jz_m1
-                Jz3_m1 += Sz_upper * Sr_lower * Jz_m1
-                Jz4_m1 += Sz_upper * Sr_upper * Jz_m1
-                # -------------------------------
-            if ir_lower == ir_upper == 0:
-                # Treat the guard cells.
-                # Add the current to the guard cells
-                # for particles that had an original
-                # cell index < 0.
-                Jr1_m0 += -1. * Sz_lower * Sr_guard * Jr_m0
-                Jr3_m0 += -1. * Sz_upper * Sr_guard * Jr_m0
-                # -----------------------------------
-                Jr1_m1 += -1. * Sz_lower * Sr_guard * Jr_m1
-                Jr3_m1 += -1. * Sz_upper * Sr_guard * Jr_m1
-                # -----------------------------------
-                Jt1_m0 += -1. * Sz_lower * Sr_guard * Jt_m0
-                Jt3_m0 += -1. * Sz_upper * Sr_guard * Jt_m0
-                # -----------------------------------
-                Jt1_m1 += -1. * Sz_lower * Sr_guard * Jt_m1
-                Jt3_m1 += -1. * Sz_upper * Sr_guard * Jt_m1
-                # -----------------------------------
-                Jz1_m0 += -1. * Sz_lower * Sr_guard * Jz_m0
-                Jz3_m0 += -1. * Sz_upper * Sr_guard * Jz_m0
-                # -----------------------------------
-                Jz1_m1 += -1. * Sz_lower * Sr_guard * Jz_m1
-                Jz3_m1 += -1. * Sz_upper * Sr_guard * Jz_m1
-        # Write the calculated field values to
-        # the field arrays defined on the interpolation grid
-        J0[iz, ir, 0] = Jr1_m0
-        J0[iz, ir, 1] = Jr1_m1
-        J0[iz, ir, 2] = Jt1_m0
-        J0[iz, ir, 3] = Jt1_m1
-        J0[iz, ir, 4] = Jz1_m0
-        J0[iz, ir, 5] = Jz1_m1
-        # --------------------
-        J1[iz, ir, 0] = Jr2_m0
-        J1[iz, ir, 1] = Jr2_m1
-        J1[iz, ir, 2] = Jt2_m0
-        J1[iz, ir, 3] = Jt2_m1
-        J1[iz, ir, 4] = Jz2_m0
-        J1[iz, ir, 5] = Jz2_m1
-        # --------------------
-        J2[iz, ir, 0] = Jr3_m0
-        J2[iz, ir, 1] = Jr3_m1
-        J2[iz, ir, 2] = Jt3_m0
-        J2[iz, ir, 3] = Jt3_m1
-        J2[iz, ir, 4] = Jz3_m0
-        J2[iz, ir, 5] = Jz3_m1
-        # --------------------
-        J3[iz, ir, 0] = Jr4_m0
-        J3[iz, ir, 1] = Jr4_m1
-        J3[iz, ir, 2] = Jt4_m0
-        J3[iz, ir, 3] = Jt4_m1
-        J3[iz, ir, 4] = Jz4_m0
-        J3[iz, ir, 5] = Jz4_m1
-
-
-@cuda.jit('void(complex128[:,:], complex128[:,:], \
-                complex128[:,:], complex128[:,:], \
-                complex128[:,:], complex128[:,:], \
-                complex128[:,:,:], complex128[:,:,:], \
-                complex128[:,:,:], complex128[:,:,:])')
-def add_J(Jr_m0, Jr_m1,
-          Jt_m0, Jt_m1,
-          Jz_m0, Jz_m1,
-          J0, J1,
-          J2, J3):
-    """
-    Merges the 4 separate field arrays that contain J for
-    each deposition direction and adds them to the global
-    interpolation grid arrays for mode 0 and 1.
-
-    Parameters
-    ----------
-    Jr_m0, Jr_m1, Jt_m0, Jt_m1, Jz_m0, Jz_m1,: 2darrays of complexs
-        The current component in each direction (r, t, z)
-        on the interpolation grid for mode 0 and 1.
-        (is modified by this function)
-
-    J0, J1, J2, J3 : 3darrays of complexs
-        2d field arrays, one for each of the deposition directions
-        The third dimension contains the two possible modes and
-        the 3 different components of J (r, t, z).
-    """
-    # Get the CUDA Grid in 2D
-    i, j = cuda.grid(2)
-    # Only for threads within (nz, nr)
-    if (i < Jr_m0.shape[0] and j < Jr_m0.shape[1]):
-        # Sum the four field arrays for the different deposition
-        # directions and write them to the global field array
-        Jr_m0[i, j] += J0[i, j, 0] + \
-            J1[i, j - 1, 0] + \
-            J2[i - 1, j, 0] + \
-            J3[i - 1, j - 1, 0]
-
-        Jr_m1[i, j] += J0[i, j, 1] + \
-            J1[i, j - 1, 1] + \
-            J2[i - 1, j, 1] + \
-            J3[i - 1, j - 1, 1]
-
-        Jt_m0[i, j] += J0[i, j, 2] + \
-            J1[i, j - 1, 2] + \
-            J2[i - 1, j, 2] + \
-            J3[i - 1, j - 1, 2]
-
-        Jt_m1[i, j] += J0[i, j, 3] + \
-            J1[i, j - 1, 3] + \
-            J2[i - 1, j, 3] + \
-            J3[i - 1, j - 1, 3]
-
-        Jz_m0[i, j] += J0[i, j, 4] + \
-            J1[i, j - 1, 4] + \
-            J2[i - 1, j, 4] + \
-            J3[i - 1, j - 1, 4]
-
-        Jz_m1[i, j] += J0[i, j, 5] + \
-            J1[i, j - 1, 5] + \
-            J2[i - 1, j, 5] + \
-            J3[i - 1, j - 1, 5]
diff --git a/fbpic/particles/cuda_methods.py b/fbpic/particles/cuda_methods.py
deleted file mode 100644
index dedc1f2e7..000000000
--- a/fbpic/particles/cuda_methods.py
+++ /dev/null
@@ -1,997 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the optimized particles methods that use cuda on a GPU
-"""
-from numba import cuda, float64, int64
-from accelerate.cuda import sorting
-import math
-from scipy.constants import c, e
-import numpy as np
-
-# -----------------------
-# Particle pusher utility
-# -----------------------
-
-@cuda.jit(device=True, inline=True)
-def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
-    Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
-    """
-    Push at single macroparticle, using the Vay pusher
-    """
-    # Get the magnetic rotation vector
-    taux = bconst*Bx
-    tauy = bconst*By
-    tauz = bconst*Bz
-    tau2 = taux**2 + tauy**2 + tauz**2
-
-    # Get the momenta at the half timestep
-    uxp = ux_i + econst*Ex \
-    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
-    uyp = uy_i + econst*Ey \
-    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
-    uzp = uz_i + econst*Ez \
-    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
-    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
-    utau = uxp*taux + uyp*tauy + uzp*tauz
-
-    # Get the new 1./gamma
-    inv_gamma_f = math.sqrt(
-        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
-
-    # Reuse the tau and utau arrays to save memory
-    tx = inv_gamma_f*taux
-    ty = inv_gamma_f*tauy
-    tz = inv_gamma_f*tauz
-    ut = inv_gamma_f*utau
-    s = 1./( 1 + tau2*inv_gamma_f**2 )
-
-    # Get the new u
-    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
-    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
-    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
-
-    return( ux_f, uy_f, uz_f, inv_gamma_f )
-
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-            float64[:], float64[:], float64[:], \
-            float64[:], float64[:], float64[:], \
-            float64, float64, int32, float64)')
-def push_p_gpu( ux, uy, uz, inv_gamma,
-                Ex, Ey, Ez, Bx, By, Bz,
-                q, m, Ntot, dt ) :
-    """
-    Advance the particles' momenta, using cuda on the GPU
-
-    Parameters
-    ----------
-    ux, uy, uz : 1darray of floats
-        The velocity of the particles
-        (is modified by this function)
-
-    inv_gamma : 1darray of floats
-        The inverse of the relativistic gamma factor
-
-    Ex, Ey, Ez : 1darray of floats
-        The electric fields acting on the particles
-
-    Bx, By, Bz : 1darray of floats
-        The magnetic fields acting on the particles
-
-    q : float
-        The charge of the particle species
-
-    m : float
-        The mass of the particle species
-
-    Ntot : int
-        The total number of particles
-
-    dt : float
-        The time by which the momenta is advanced
-    """
-    # Set a few constants
-    econst = q*dt/(m*c)
-    bconst = 0.5*q*dt/m
-
-    #Cuda 1D grid
-    ip = cuda.grid(1)
-
-    # Loop over the particles
-    if ip < Ntot:
-        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
-            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
-            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst)
-
-@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
-            float64[:], float64[:], float64[:], \
-            float64[:], float64[:], float64[:], \
-            float64, int32, float64, int16[:])')
-def push_p_ioniz_gpu( ux, uy, uz, inv_gamma,
-                Ex, Ey, Ez, Bx, By, Bz,
-                m, Ntot, dt, ionization_level ) :
-    """
-    Advance the particles' momenta, using numba on the GPU
-    This take into account that the particles are ionizable, and thus
-    that their charge is determined by `ionization_level`
-
-    Parameters
-    ----------
-    ux, uy, uz : 1darray of floats
-        The velocity of the particles
-        (is modified by this function)
-
-    inv_gamma : 1darray of floats
-        The inverse of the relativistic gamma factor
-
-    Ex, Ey, Ez : 1darray of floats
-        The electric fields acting on the particles
-
-    Bx, By, Bz : 1darray of floats
-        The magnetic fields acting on the particles
-
-    m : float
-        The mass of the particle species
-
-    Ntot : int
-        The total number of particles
-
-    dt : float
-        The time by which the momenta is advanced
-
-    ionization_level : 1darray of ints
-        The number of electrons that each ion is missing
-        (compared to a neutral atom)
-    """
-    #Cuda 1D grid
-    ip = cuda.grid(1)
-
-    # Loop over the particles
-    if ip < Ntot:
-        if ionization_level[ip] != 0:
-            # Set a few constants
-            econst = ionization_level[ip] * e * dt/(m*c)
-            bconst = 0.5 * ionization_level[ip] * e * dt/m
-            # Use the Vay pusher
-            ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
-                ux[ip], uy[ip], uz[ip], inv_gamma[ip],
-                Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst)
-
-@cuda.jit('void(float64[:], float64[:], float64[:], \
-            float64[:], float64[:], float64[:], \
-            float64[:], float64)')
-def push_x_gpu( x, y, z, ux, uy, uz, inv_gamma, dt ) :
-    """
-    Advance the particles' positions over one half-timestep
-
-    This assumes that the positions (x, y, z) are initially either
-    one half-timestep *behind* the momenta (ux, uy, uz), or at the
-    same timestep as the momenta.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-        (is modified by this function)
-
-    ux, uy, uz : 1darray of floats (in meters * second^-1)
-        The velocity of the particles
-
-    inv_gamma : 1darray of floats
-        The inverse of the relativistic gamma factor
-
-    dt : float (seconds)
-        The time by which the position is advanced
-    """
-    # Half timestep, multiplied by c
-    chdt = c*0.5*dt
-
-    i = cuda.grid(1)
-    if i < x.shape[0]:
-        # Particle push
-        inv_g = inv_gamma[i]
-        x[i] += chdt*inv_g*ux[i]
-        y[i] += chdt*inv_g*uy[i]
-        z[i] += chdt*inv_g*uz[i]
-
-# -----------------------
-# Field gathering utility
-# -----------------------
-
-@cuda.jit('void(float64[:], float64[:], float64[:], \
-            float64, float64, int32, \
-            float64, float64, int32, \
-            complex128[:,:], complex128[:,:], complex128[:,:], \
-            complex128[:,:], complex128[:,:], complex128[:,:], \
-            complex128[:,:], complex128[:,:], complex128[:,:], \
-            complex128[:,:], complex128[:,:], complex128[:,:], \
-            float64[:], float64[:], float64[:], \
-            float64[:], float64[:], float64[:])')
-def gather_field_gpu_linear(x, y, z,
-                    invdz, zmin, Nz,
-                    invdr, rmin, Nr,
-                    Er_m0, Et_m0, Ez_m0,
-                    Er_m1, Et_m1, Ez_m1,
-                    Br_m0, Bt_m0, Bz_m0,
-                    Br_m1, Bt_m1, Bz_m1,
-                    Ex, Ey, Ez,
-                    Bx, By, Bz):
-    """
-    Gathering of the fields (E and B) using numba on the GPU.
-    Iterates over the particles, calculates the weighted amount
-    of fields acting on each particle based on its shape (linear).
-    Fields are gathered in cylindrical coordinates and then
-    transformed to cartesian coordinates.
-    Supports only mode 0 and 1.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box along the
-        direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    Er_m0, Et_m0, Ez_m0 : 2darray of complexs
-        The electric fields on the interpolation grid for the mode 0
-
-    Er_m1, Et_m1, Ez_m1 : 2darray of complexs
-        The electric fields on the interpolation grid for the mode 1
-
-    Br_m0, Bt_m0, Bz_m0 : 2darray of complexs
-        The magnetic fields on the interpolation grid for the mode 0
-
-    Br_m1, Bt_m1, Bz_m1 : 2darray of complexs
-        The magnetic fields on the interpolation grid for the mode 1
-
-    Ex, Ey, Ez : 1darray of floats
-        The electric fields acting on the particles
-        (is modified by this function)
-
-    Bx, By, Bz : 1darray of floats
-        The magnetic fields acting on the particles
-        (is modified by this function)
-    """
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel
-    # (for threads < number of particles)
-    if i < x.shape[0]:
-        # Preliminary arrays for the cylindrical conversion
-        # --------------------------------------------
-        # Position
-        xj = x[i]
-        yj = y[i]
-        zj = z[i]
-
-        # Cylindrical conversion
-        rj = math.sqrt( xj**2 + yj**2 )
-        if (rj !=0. ) :
-            invr = 1./rj
-            cos = xj*invr  # Cosine
-            sin = yj*invr  # Sine
-        else :
-            cos = 1.
-            sin = 0.
-        exptheta_m0 = 1.
-        exptheta_m1 = cos - 1.j*sin
-
-        # Get linear weights for the deposition
-        # --------------------------------------------
-        # Positions of the particles, in the cell unit
-        r_cell =  invdr*(rj - rmin) - 0.5
-        z_cell =  invdz*(zj - zmin) - 0.5
-        # Original index of the uppper and lower cell
-        ir_lower = int(math.floor( r_cell ))
-        ir_upper = ir_lower + 1
-        iz_lower = int(math.floor( z_cell ))
-        iz_upper = iz_lower + 1
-        # Linear weight
-        Sr_lower = ir_upper - r_cell
-        Sr_upper = r_cell - ir_lower
-        Sz_lower = iz_upper - z_cell
-        Sz_upper = z_cell - iz_lower
-        # Set guard weights to zero
-        Sr_guard = 0.
-
-        # Treat the boundary conditions
-        # --------------------------------------------
-        # guard cells in lower r
-        if ir_lower < 0:
-            Sr_guard = Sr_lower
-            Sr_lower = 0.
-            ir_lower = 0
-        # absorbing in upper r
-        if ir_lower > Nr-1:
-            ir_lower = Nr-1
-        if ir_upper > Nr-1:
-            ir_upper = Nr-1
-        # periodic boundaries in z
-        # lower z boundaries
-        if iz_lower < 0:
-            iz_lower += Nz
-        if iz_upper < 0:
-            iz_upper += Nz
-        # upper z boundaries
-        if iz_lower > Nz-1:
-            iz_lower -= Nz
-        if iz_upper > Nz-1:
-            iz_upper -= Nz
-
-        #Precalculate Shapes
-        S_ll = Sz_lower*Sr_lower
-        S_lu = Sz_lower*Sr_upper
-        S_ul = Sz_upper*Sr_lower
-        S_uu = Sz_upper*Sr_upper
-        S_lg = Sz_lower*Sr_guard
-        S_ug = Sz_upper*Sr_guard
-
-        # E-Field
-        # ----------------------------
-        # Define the initial placeholders for the
-        # gathered field for each coordinate
-        Fr = 0.
-        Ft = 0.
-        Fz = 0.
-
-        # Mode 0
-        # ----------------------------
-        # Create temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 0
-        # Lower cell in z, Lower cell in r
-        Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ]
-        Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ]
-        Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ]
-        # Lower cell in z, Upper cell in r
-        Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ]
-        Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ]
-        Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ]
-        # Upper cell in z, Lower cell in r
-        Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ]
-        Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ]
-        Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ]
-        # Upper cell in z, Upper cell in r
-        Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ]
-        Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ]
-        Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ]
-        # Add the fields from the guard cells
-        if ir_lower == ir_upper == 0:
-            # Lower cell in z
-            Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0]
-            Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0]
-            Fz_m +=  1. * S_lg * Ez_m0[ iz_lower, 0]
-            # Upper cell in z
-            Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0]
-            Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0]
-            Fz_m +=  1. * S_ug * Ez_m0[ iz_upper, 0]
-        # Add the fields from the mode 0
-        Fr += (Fr_m*exptheta_m0).real
-        Ft += (Ft_m*exptheta_m0).real
-        Fz += (Fz_m*exptheta_m0).real
-
-        # Mode 1
-        # ----------------------------
-        # Clear the temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 1
-        # Lower cell in z, Lower cell in r
-        Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ]
-        Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ]
-        Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ]
-        # Lower cell in z, Upper cell in r
-        Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ]
-        Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ]
-        Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ]
-        # Upper cell in z, Lower cell in r
-        Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ]
-        Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ]
-        Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ]
-        # Upper cell in z, Upper cell in r
-        Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ]
-        Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ]
-        Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ]
-        # Add the fields from the guard cells
-        if ir_lower == ir_upper == 0:
-            # Lower cell in z
-            Fr_m +=  1. * S_lg * Er_m1[ iz_lower, 0]
-            Ft_m +=  1. * S_lg * Et_m1[ iz_lower, 0]
-            Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0]
-            # Upper cell in z
-            Fr_m +=  1. * S_ug * Er_m1[ iz_upper, 0]
-            Ft_m +=  1. * S_ug * Et_m1[ iz_upper, 0]
-            Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0]
-        # Add the fields from the mode 1
-        Fr += 2*(Fr_m*exptheta_m1).real
-        Ft += 2*(Ft_m*exptheta_m1).real
-        Fz += 2*(Fz_m*exptheta_m1).real
-
-        # Convert to Cartesian coordinates
-        # and write to particle field arrays
-        Ex[i] = cos*Fr - sin*Ft
-        Ey[i] = sin*Fr + cos*Ft
-        Ez[i] = Fz
-
-        # B-Field
-        # ----------------------------
-        # Clear the placeholders for the
-        # gathered field for each coordinate
-        Fr = 0.
-        Ft = 0.
-        Fz = 0.
-
-        # Mode 0
-        # ----------------------------
-        # Create temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 0
-        # Lower cell in z, Lower cell in r
-        Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ]
-        Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ]
-        Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ]
-        # Lower cell in z, Upper cell in r
-        Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ]
-        Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ]
-        Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ]
-        # Upper cell in z, Lower cell in r
-        Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ]
-        Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ]
-        Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ]
-        # Upper cell in z, Upper cell in r
-        Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ]
-        Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ]
-        Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ]
-        # Add the fields from the guard cells
-        if ir_lower == ir_upper == 0:
-            # Lower cell in z
-            Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0]
-            Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0]
-            Fz_m +=  1. * S_lg * Bz_m0[ iz_lower, 0]
-            # Upper cell in z
-            Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0]
-            Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0]
-            Fz_m +=  1. * S_ug * Bz_m0[ iz_upper, 0]
-        # Add the fields from the mode 0
-        Fr += (Fr_m*exptheta_m0).real
-        Ft += (Ft_m*exptheta_m0).real
-        Fz += (Fz_m*exptheta_m0).real
-
-        # Mode 1
-        # ----------------------------
-        # Clear the temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 1
-        # Lower cell in z, Lower cell in r
-        Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ]
-        Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ]
-        Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ]
-        # Lower cell in z, Upper cell in r
-        Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ]
-        Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ]
-        Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ]
-        # Upper cell in z, Lower cell in r
-        Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ]
-        Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ]
-        Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ]
-        # Upper cell in z, Upper cell in r
-        Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ]
-        Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ]
-        Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ]
-
-        # Add the fields from the guard cells
-        if ir_lower == ir_upper == 0:
-            # Lower cell in z
-            Fr_m +=  1. * S_lg * Br_m1[ iz_lower, 0]
-            Ft_m +=  1. * S_lg * Bt_m1[ iz_lower, 0]
-            Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0]
-            # Upper cell in z
-            Fr_m +=  1. * S_ug * Br_m1[ iz_upper, 0]
-            Ft_m +=  1. * S_ug * Bt_m1[ iz_upper, 0]
-            Fz_m += -1. * S_ug * Bz_m1[ iz_upper, 0]
-        # Add the fields from the mode 1
-        Fr += 2*(Fr_m*exptheta_m1).real
-        Ft += 2*(Ft_m*exptheta_m1).real
-        Fz += 2*(Fz_m*exptheta_m1).real
-
-        # Convert to Cartesian coordinates
-        # and write to particle field arrays
-        Bx[i] = cos*Fr - sin*Ft
-        By[i] = sin*Fr + cos*Ft
-        Bz[i] = Fz
-
-
-@cuda.jit('void(float64[:], float64[:], float64[:], \
-            float64, float64, int32, \
-            float64, float64, int32, \
-            complex128[:,:], complex128[:,:], complex128[:,:], \
-            complex128[:,:], complex128[:,:], complex128[:,:], \
-            complex128[:,:], complex128[:,:], complex128[:,:], \
-            complex128[:,:], complex128[:,:], complex128[:,:], \
-            float64[:], float64[:], float64[:], \
-            float64[:], float64[:], float64[:])')
-def gather_field_gpu_cubic(x, y, z,
-                    invdz, zmin, Nz,
-                    invdr, rmin, Nr,
-                    Er_m0, Et_m0, Ez_m0,
-                    Er_m1, Et_m1, Ez_m1,
-                    Br_m0, Bt_m0, Bz_m0,
-                    Br_m1, Bt_m1, Bz_m1,
-                    Ex, Ey, Ez,
-                    Bx, By, Bz):
-    """
-    Gathering of the fields (E and B) using numba on the GPU.
-    Iterates over the particles, calculates the weighted amount
-    of fields acting on each particle based on its shape (cubic).
-    Fields are gathered in cylindrical coordinates and then
-    transformed to cartesian coordinates.
-    Supports only mode 0 and 1.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box along the
-        direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    Er_m0, Et_m0, Ez_m0 : 2darray of complexs
-        The electric fields on the interpolation grid for the mode 0
-
-    Er_m1, Et_m1, Ez_m1 : 2darray of complexs
-        The electric fields on the interpolation grid for the mode 1
-
-    Br_m0, Bt_m0, Bz_m0 : 2darray of complexs
-        The magnetic fields on the interpolation grid for the mode 0
-
-    Br_m1, Bt_m1, Bz_m1 : 2darray of complexs
-        The magnetic fields on the interpolation grid for the mode 1
-
-    Ex, Ey, Ez : 1darray of floats
-        The electric fields acting on the particles
-        (is modified by this function)
-
-    Bx, By, Bz : 1darray of floats
-        The magnetic fields acting on the particles
-        (is modified by this function)
-    """
-
-    # Get the 1D CUDA grid
-    i = cuda.grid(1)
-    # Deposit the field per cell in parallel
-    # (for threads < number of particles)
-    if i < x.shape[0]:
-        # Preliminary arrays for the cylindrical conversion
-        # --------------------------------------------
-        # Position
-        xj = x[i]
-        yj = y[i]
-        zj = z[i]
-
-        # Cylindrical conversion
-        rj = math.sqrt(xj**2 + yj**2)
-        if (rj != 0.):
-            invr = 1./rj
-            cos = xj*invr  # Cosine
-            sin = yj*invr  # Sine
-        else:
-            cos = 1.
-            sin = 0.
-        exptheta_m0 = 1.
-        exptheta_m1 = cos - 1.j*sin
-
-        # Get weights for the deposition
-        # --------------------------------------------
-        # Positions of the particle, in the cell unit
-        r_cell = invdr*(rj - rmin) - 0.5
-        z_cell = invdz*(zj - zmin) - 0.5
-
-        # Calculate the shape factors
-        Sr = cuda.local.array((4,), dtype=float64)
-        ir = cuda.local.array((4,), dtype=int64)
-        ir[0] = int64(math.floor(r_cell)) - 1
-        ir[1] = ir[0] + 1
-        ir[2] = ir[1] + 1
-        ir[3] = ir[2] + 1
-        Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3
-        Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4)
-        Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4)
-        Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3
-        iz = cuda.local.array((4,), dtype=int64)
-        Sz = cuda.local.array((4,), dtype=float64)
-        iz[0] = int64(math.floor(z_cell)) - 1
-        iz[1] = iz[0] + 1
-        iz[2] = iz[1] + 1
-        iz[3] = iz[2] + 1
-        Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3
-        Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4)
-        Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4)
-        Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3
-        # Lower and upper periodic boundary for z
-        for index_z in range(4):
-            if iz[index_z] < 0:
-                iz[index_z] += Nz
-            if iz[index_z] > Nz - 1:
-                iz[index_z] -= Nz
-        # Lower and upper boundary for r
-        for index_r in range(4):
-            if ir[index_r] < 0:
-                ir[index_r] = abs(ir[index_r])-1
-                Sr[index_r] = (-1.)*Sr[index_r]
-            if ir[index_r] > Nr - 1:
-                ir[index_r] = Nr - 1
-
-        # E-Field
-        # ----------------------------
-        # Define the initial placeholders for the
-        # gathered field for each coordinate
-        Fr = 0.
-        Ft = 0.
-        Fz = 0.
-
-        # Mode 0
-        # ----------------------------
-        # Create temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 0
-        for index_r in range(4):
-            for index_z in range(4):
-                Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]]
-                Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]]
-                if Sz[index_z]*Sr[index_r] < 0:
-                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                        Ez_m0[iz[index_z], ir[index_r]]
-                else:
-                    Fz_m += Sz[index_z]*Sr[index_r]* \
-                        Ez_m0[iz[index_z], ir[index_r]]
-
-        Fr += (Fr_m*exptheta_m0).real
-        Ft += (Ft_m*exptheta_m0).real
-        Fz += (Fz_m*exptheta_m0).real
-
-        # Mode 1
-        # ----------------------------
-        # Clear the temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 1
-        for index_r in range(4):
-            for index_z in range(4):
-                if Sz[index_z]*Sr[index_r] < 0:
-                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                                Er_m1[iz[index_z], ir[index_r]]
-                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                                Et_m1[iz[index_z], ir[index_r]]
-                else:
-                    Fr_m += Sz[index_z]*Sr[index_r]* \
-                                Er_m1[iz[index_z], ir[index_r]]
-                    Ft_m += Sz[index_z]*Sr[index_r]* \
-                                Et_m1[iz[index_z], ir[index_r]]
-                Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]]
-
-        # Add the fields from the mode 1
-        Fr += 2*(Fr_m*exptheta_m1).real
-        Ft += 2*(Ft_m*exptheta_m1).real
-        Fz += 2*(Fz_m*exptheta_m1).real
-
-        # Convert to Cartesian coordinates
-        # and write to particle field arrays
-        Ex[i] = (cos*Fr - sin*Ft)
-        Ey[i] = (sin*Fr + cos*Ft)
-        Ez[i] = Fz
-
-        # B-Field
-        # ----------------------------
-        # Clear the placeholders for the
-        # gathered field for each coordinate
-        Fr = 0.
-        Ft = 0.
-        Fz = 0.
-
-        # Mode 0
-        # ----------------------------
-        # Create temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 0
-        for index_r in range(4):
-            for index_z in range(4):
-                Fr_m += Sz[index_z]*Sr[index_r]* \
-                    Br_m0[iz[index_z], ir[index_r]]
-                Ft_m += Sz[index_z]*Sr[index_r]* \
-                    Bt_m0[iz[index_z], ir[index_r]]
-                if Sz[index_z]*Sr[index_r] < 0:
-                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                        Bz_m0[iz[index_z], ir[index_r]]
-                else:
-                    Fz_m += Sz[index_z]*Sr[index_r]* \
-                        Bz_m0[iz[index_z], ir[index_r]]
-
-        # Add the fields from the mode 0
-        Fr += (Fr_m*exptheta_m0).real
-        Ft += (Ft_m*exptheta_m0).real
-        Fz += (Fz_m*exptheta_m0).real
-
-        # Mode 1
-        # ----------------------------
-        # Clear the temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-
-        # Add the fields for mode 1
-        for index_r in range(4):
-            for index_z in range(4):
-                if Sz[index_z]*Sr[index_r] < 0:
-                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                        Br_m1[iz[index_z], ir[index_r]]
-                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                        Bt_m1[iz[index_z], ir[index_r]]
-                else:
-                    Fr_m += Sz[index_z]*Sr[index_r]* \
-                        Br_m1[iz[index_z], ir[index_r]]
-                    Ft_m += Sz[index_z]*Sr[index_r]* \
-                        Bt_m1[iz[index_z], ir[index_r]]
-                Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]]
-
-        # Add the fields from the mode 1
-        Fr += 2*(Fr_m*exptheta_m1).real
-        Ft += 2*(Ft_m*exptheta_m1).real
-        Fz += 2*(Fz_m*exptheta_m1).real
-
-        # Convert to Cartesian coordinates
-        # and write to particle field arrays
-        Bx[i] = cos*Fr - sin*Ft
-        By[i] = sin*Fr + cos*Ft
-        Bz[i] = Fz
-
-# -----------------------------------------------------
-# Sorting utilities - get_cell_idx / sort / prefix_sum
-# -----------------------------------------------------
-
-@cuda.jit('void(int32[:], uint32[:], \
-                float64[:], float64[:], float64[:], \
-                float64, float64, int32, \
-                float64, float64, int32)')
-def get_cell_idx_per_particle(cell_idx, sorted_idx,
-                              x, y, z,
-                              invdz, zmin, Nz,
-                              invdr, rmin, Nr):
-    """
-    Get the cell index of each particle.
-    The cell index is 1d and calculated by:
-    cell index in z + cell index in r * number of cells in z.
-    The cell_idx of a particle is defined by
-    the lower cell in r and z, that it deposits its field to.
-
-    Parameters
-    ----------
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    sorted_idx : 1darray of integers
-        The sorted index array needs to be reset
-        before doing the sort
-
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-        (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box, in each direction
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-    """
-    i = cuda.grid(1)
-    if i < cell_idx.shape[0]:
-            # Preliminary arrays for the cylindrical conversion
-            xj = x[i]
-            yj = y[i]
-            zj = z[i]
-            rj = math.sqrt( xj**2 + yj**2 )
-
-            # Positions of the particles, in the cell unit
-            r_cell =  invdr*(rj - rmin) - 0.5
-            z_cell =  invdz*(zj - zmin) - 0.5
-
-            # Original index of the uppper and lower cell
-            ir_lower = int(math.floor( r_cell ))
-            iz_lower = int(math.floor( z_cell ))
-
-            # Treat the boundary conditions
-            # guard cells in lower r
-            if ir_lower < 0:
-                ir_lower = 0
-            # absorbing in upper r
-            if ir_lower > Nr-1:
-                ir_lower = Nr-1
-            # periodic boundaries in z
-            if iz_lower < 0:
-                iz_lower += Nz
-            if iz_lower > Nz-1:
-                iz_lower -= Nz
-
-            # Reset sorted_idx array
-            sorted_idx[i] = i
-            # Calculate the 1D cell_idx by cell_idx_ir + cell_idx_iz * Nr
-            cell_idx[i] = ir_lower + iz_lower * Nr
-
-def sort_particles_per_cell(cell_idx, sorted_idx):
-    """
-    Sort the cell index of the particles and
-    modify the sorted index array accordingly.
-
-    Parameters
-    ----------
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    sorted_idx : 1darray of integers
-        Represents the original index of the
-        particle before the sorting.
-    """
-    Ntot = cell_idx.shape[0]
-    if Ntot > 0:
-        sorter = sorting.RadixSort(Ntot, dtype = np.int32)
-        sorter.sort(cell_idx, vals = sorted_idx)
-
-@cuda.jit('void(int32[:], int32[:])')
-def incl_prefix_sum(cell_idx, prefix_sum):
-    """
-    Perform an inclusive parallel prefix sum on the sorted
-    cell index array. The prefix sum array represents the
-    cumulative sum of the number of particles per cell
-    for each cell index.
-
-    Parameters
-    ----------
-    cell_idx : 1darray of integers
-        The cell index of the particle
-
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    # i is the index of the macroparticle
-    i = cuda.grid(1)
-    if i < cell_idx.shape[0]-1:
-        # ci: index of the cell of the present macroparticle
-        ci = cell_idx[i]
-        # ci_next: index of the cell of the next macroparticle
-        ci_next = cell_idx[i+1]
-        # Fill all the cells between ci and ci_next with the
-        # inclusive cumulative sum of the number particles until ci
-        while ci < ci_next:
-            # The cumulative sum of the number of particle per cell
-            # until ci is i+1 (since i obeys python index, starting at 0)
-            prefix_sum[ci] = i+1
-            ci += 1
-    # The last "macroparticle" of the cell_idx array fills up the
-    # rest of the prefix sum array
-    if i == cell_idx.shape[0]-1:
-        # Get the cell_index of the last macroparticle
-        ci = cell_idx[i]
-        # Fill all the remaining entries of the prefix sum array
-        for empty_index in range(ci, prefix_sum.shape[0]):
-            prefix_sum[empty_index] = i+1
-
-@cuda.jit('void(int32[:])')
-def reset_prefix_sum(prefix_sum):
-    """
-    Resets the prefix sum. Sets all the values
-    to zero.
-
-    Parameters
-    ----------
-    prefix_sum : 1darray of integers
-        Represents the cumulative sum of
-        the particles per cell
-    """
-    i = cuda.grid(1)
-    if i < prefix_sum.shape[0]:
-        prefix_sum[i] = 0
-
-@cuda.jit('void(uint32[:], float64[:], float64[:])')
-def write_sorting_buffer(sorted_idx, val, buf):
-    """
-    Writes the values of a particle array to a buffer,
-    while rearranging them to match the sorted cell index array.
-
-    Parameters
-    ----------
-    sorted_idx : 1darray of integers
-        Represents the original index of the
-        particle before the sorting
-
-    val : 1d array of floats
-        A particle data array
-
-    buf : 1d array of floats
-        A buffer array to temporarily store the
-        sorted particle data array
-    """
-    i = cuda.grid(1)
-    if i < val.shape[0]:
-        buf[i] = val[sorted_idx[i]]
-
-# -----------------------------------------------------
-# Device array creation utility (will be removed later)
-# -----------------------------------------------------
-
-def cuda_deposition_arrays(Nz = None, Nr = None, fieldtype = None):
-    """
-    Create empty arrays on the GPU for the charge and
-    current deposition in each of the 4 possible direction.
-
-    ###########################################
-    # Needs to be moved to the fields package!
-    ###########################################
-
-    Parameters
-    ----------
-    Nz : int
-        Number of cells in z.
-    Nr : int
-        Number of cells in r.
-
-    fieldtype : string
-        Either 'rho' or 'J'.
-    """
-    # Create empty arrays to store the four different possible
-    # cell directions a particle can deposit to.
-    if fieldtype == 'rho':
-        # Rho - third dimension represents 2 modes
-        rho0 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128)
-        rho1 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128)
-        rho2 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128)
-        rho3 = cuda.device_array(shape = (Nz, Nr, 2), dtype = np.complex128)
-        return rho0, rho1, rho2, rho3
-
-    if fieldtype == 'J':
-        # J - third dimension represents 2 modes
-        # times 3 dimensions (r, t, z)
-        J0 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128)
-        J1 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128)
-        J2 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128)
-        J3 = cuda.device_array(shape = (Nz, Nr, 6), dtype = np.complex128)
-        return J0, J1, J2, J3
diff --git a/fbpic/particles/numba_methods.py b/fbpic/particles/numba_methods.py
deleted file mode 100644
index 17d94a84a..000000000
--- a/fbpic/particles/numba_methods.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the optimized particles methods that use numba on a CPU
-"""
-import numba
-import math
-from scipy.constants import c, e
-
-# -----------------------
-# Particle pusher utility
-# -----------------------
-
-@numba.jit(nopython=True)
-def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
-    """
-    Advance the particles' positions over one half-timestep
-
-    This assumes that the positions (x, y, z) are initially either
-    one half-timestep *behind* the momenta (ux, uy, uz), or at the
-    same timestep as the momenta.
-    """
-    # Half timestep, multiplied by c
-    chdt = c*0.5*dt
-
-    # Particle push
-    for ip in range(Ntot) :
-        x[ip] += chdt * inv_gamma[ip] * ux[ip]
-        y[ip] += chdt * inv_gamma[ip] * uy[ip]
-        z[ip] += chdt * inv_gamma[ip] * uz[ip]
-
-@numba.jit(nopython=True)
-def push_p_numba( ux, uy, uz, inv_gamma,
-                Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) :
-    """
-    Advance the particles' momenta, using numba
-    """
-    # Set a few constants
-    econst = q*dt/(m*c)
-    bconst = 0.5*q*dt/m
-
-    # Loop over the particles
-    for ip in range(Ntot) :
-        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
-            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
-            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst )
-
-@numba.jit(nopython=True)
-def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
-                Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) :
-    """
-    Advance the particles' momenta, using numba
-    """
-    # Set a few constants
-    prefactor_econst = e*dt/(m*c)
-    prefactor_bconst = 0.5*e*dt/m
-
-    # Loop over the particles
-    for ip in range(Ntot) :
-
-        # For neutral macroparticles, skip this step
-        if ionization_level[ip] == 0:
-            continue
-
-        # Calculate the charge dependent constants
-        econst = prefactor_econst * ionization_level[ip]
-        bconst = prefactor_bconst * ionization_level[ip]
-        # Perform the push
-        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
-            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
-            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip],
-            econst, bconst )
-
-@numba.jit(nopython=True)
-def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
-                Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
-    """
-    Push at single macroparticle, using the Vay pusher
-    """
-    # Get the magnetic rotation vector
-    taux = bconst*Bx
-    tauy = bconst*By
-    tauz = bconst*Bz
-    tau2 = taux**2 + tauy**2 + tauz**2
-
-    # Get the momenta at the half timestep
-    uxp = ux_i + econst*Ex \
-    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
-    uyp = uy_i + econst*Ey \
-    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
-    uzp = uz_i + econst*Ez \
-    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
-    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
-    utau = uxp*taux + uyp*tauy + uzp*tauz
-
-    # Get the new 1./gamma
-    inv_gamma_f = math.sqrt(
-        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
-
-    # Reuse the tau and utau variables to save memory
-    tx = inv_gamma_f*taux
-    ty = inv_gamma_f*tauy
-    tz = inv_gamma_f*tauz
-    ut = inv_gamma_f*utau
-    s = 1./( 1 + tau2*inv_gamma_f**2 )
-
-    # Get the new u
-    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
-    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
-    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
-
-    return( ux_f, uy_f, uz_f, inv_gamma_f )
-
-# -----------------------
-# Field gathering utility
-# -----------------------
-
-@numba.jit(nopython=True)
-def gather_field_numba(exptheta, m, Fgrid, Fptcl,
-                       iz, ir, Sz, Sr, sign_guards):
-    """
-    Perform the weighted sum using numba
-
-    Parameters
-    ----------
-    exptheta : 1darray of complexs
-        (one element per macroparticle)
-        Contains exp(-im theta) for each macroparticle
-
-    m : int
-        Index of the mode.
-        Determines wether a factor 2 should be applied
-
-    Fgrid : 2darray of complexs
-        Contains the fields on the interpolation grid,
-        from which to do the gathering
-
-    Fptcl : 1darray of floats
-        (one element per macroparticle)
-        Contains the fields for each macroparticle
-        Is modified by this function
-
-    iz, ir : 2darray of ints
-        Arrays of shape (shape_order+1, Ntot)
-        where Ntot is the number of macroparticles
-        Contains the index of the cells that each macroparticle
-        will gather from.
-
-    Sz, Sr: 2darray of floats
-        Arrays of shape (shape_order+1, Ntot)
-        where Ntot is the number of macroparticles
-        Contains the weight for respective cells from iz and ir,
-        for each macroparticle.
-
-    sign_guards : float
-       The sign (+1 or -1) with which the weight of the guard cells should
-       be added to the 0th cell.
-    """
-    # Get the total number of particles
-    Ntot = len(Fptcl)
-
-    # Loop over the particles
-    for ip in range(Ntot):
-        # Erase the temporary variable
-        F = 0.j
-        # Loop over all the adjacent cells (given by shape order)
-        # Use helper variables `ir_corr` and `Sr_corr`.
-        # This is necessary, because ir and Sr should **not** be modified
-        # **in-place**. (This is because ir and Sr are reused several
-        # times, as we call the present function 3 times, with different
-        # values for sign_guards.)
-        for cell_index_r in range(ir.shape[0]):
-            for cell_index_z in range(iz.shape[0]):
-                # Correct the guard cell index and sign
-                if ir[cell_index_r, ip] < 0:
-                    ir_corr = abs(ir[cell_index_r, ip]) - 1
-                    Sr_corr = sign_guards * Sr[cell_index_r, ip]
-                else:
-                    ir_corr = ir[cell_index_r, ip]
-                    Sr_corr = Sr[cell_index_r, ip]
-                # Gather the field value at the respective grid point
-                F += Sz[cell_index_z, ip] * Sr_corr * \
-                    Fgrid[ iz[cell_index_z, ip], ir_corr]
-
-        # Add the complex phase
-        if m == 0:
-            Fptcl[ip] += (F * exptheta[ip]).real
-        if m > 0:
-            Fptcl[ip] += 2 * (F * exptheta[ip]).real
-
-# -------------------------
-# Charge deposition utility
-# -------------------------
-
-@numba.jit(nopython=True)
-def deposit_field_numba(Fptcl, Fgrid,
-        iz, ir, Sz, Sr, sign_guards):
-    """
-    Perform the deposition using numba
-
-    Parameters
-    ----------
-    Fptcl : 1darray of complexs
-        (one element per macroparticle)
-        Contains the charge or current for each macroparticle (already
-        multiplied by exp(im theta), from which to do the deposition
-
-    Fgrid : 2darray of complexs
-        Contains the fields on the interpolation grid.
-        Is modified by this function
-
-    iz, ir : 2darray of ints
-        Arrays of shape (shape_order+1, Ntot)
-        where Ntot is the number of macroparticles
-        Contains the index of the cells that each macroparticle
-        will deposit to.
-
-    Sz, Sr: 2darray of floats
-        Arrays of shape (shape_order+1, Ntot)
-        where Ntot is the number of macroparticles
-        Contains the weight for respective cells from iz and ir,
-        for each macroparticle.
-
-    sign_guards : float
-       The sign (+1 or -1) with which the weight of the guard cells should
-       be added to the 0th cell.
-    """
-
-    # Get the total number of particles
-    Ntot = len(Fptcl)
-
-    # Loop over all particles
-    for ip in range(Ntot):
-        # Loop over adjacent cells (given by shape order)
-        # Use helper variables `ir_corr` and `Sr_corr`, in order to avoid
-        # modifying ir and Sr in place. (This is not strictly necessary,
-        # but is just here as a safeguard.)
-        for cell_index_r in range(ir.shape[0]):
-            for cell_index_z in range(iz.shape[0]):
-                # Correct the guard cell index and sign
-                if ir[cell_index_r, ip] < 0:
-                    ir_corr = abs(ir[cell_index_r, ip]) - 1
-                    Sr_corr = sign_guards * Sr[cell_index_r, ip]
-                else:
-                    ir_corr = ir[cell_index_r, ip]
-                    Sr_corr = Sr[cell_index_r, ip]
-                # Deposit field from particle to the respective grid point
-                Fgrid[ iz[cell_index_z, ip], ir_corr ] += \
-                    Sz[cell_index_z,ip] * Sr_corr * Fptcl[ip]
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index f9a15a945..0094e305e 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -9,28 +9,36 @@
 from scipy.constants import c, e
 from .ionization import Ionizer
 from .tracking import ParticleTracker
+import numba
+import math
 
 # Load the utility methods
-from .utility_methods import weights, unalign_angles
-# Load the numba routines
-from .numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba, \
-        gather_field_numba, deposit_field_numba
+from .utilities.utility_methods import weights, unalign_angles
+# Load the numba methods
+from .push.numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba
+from .deposition.numba_methods import deposit_field_numba
+from .gathering.numba_methods import gather_field_numba
+# Load the numba CPU multi-threading methods
+from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \
+    push_x_prange
+from .deposition.threading_methods import deposit_rho_prange_linear, \
+    deposit_J_prange_linear #CUBIC tbd
+from .gathering.threading_methods import gather_field_prange_linear, \
+    gather_field_prange_cubic
 
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed
 if cuda_installed:
+    # Load the CUDA methods
     from fbpic.cuda_utils import cuda, cuda_tpb_bpg_1d, cuda_tpb_bpg_2d
-    from .cuda_methods import push_p_gpu, push_p_ioniz_gpu, push_x_gpu, \
-        gather_field_gpu_linear, gather_field_gpu_cubic, \
-        write_sorting_buffer, cuda_deposition_arrays, \
+    from .push.cuda_methods import push_p_gpu, push_p_ioniz_gpu, push_x_gpu
+    from .deposition.cuda_methods import deposit_rho_gpu_linear, \
+        deposit_J_gpu_linear, deposit_rho_gpu_cubic, deposit_J_gpu_cubic
+    from .gathering.cuda_methods import gather_field_gpu_linear, \
+        gather_field_gpu_cubic
+    from .utilities.cuda_sorting import write_sorting_buffer, \
         get_cell_idx_per_particle, sort_particles_per_cell, \
         reset_prefix_sum, incl_prefix_sum
-    from .cuda_deposition.cubic import deposit_rho_gpu_cubic, \
-        deposit_J_gpu_cubic
-    from .cuda_deposition.linear import deposit_rho_gpu_linear, \
-        deposit_J_gpu_linear
-    from .cuda_deposition.linear_non_atomic import deposit_rho_gpu, \
-        deposit_J_gpu, add_rho, add_J
 
 class Particles(object) :
     """
@@ -50,7 +58,8 @@ def __init__(self, q, m, n, Npz, zmin, zmax,
                     ux_m=0., uy_m=0., uz_m=0.,
                     ux_th=0., uy_th=0., uz_th=0.,
                     dens_func=None, continuous_injection=True,
-                    use_cuda=False, grid_shape=None, particle_shape='linear' ) :
+                    grid_shape=None, particle_shape='linear',
+                    use_cuda=False, use_threading=True) :
         """
         Initialize a uniform set of particles
 
@@ -100,9 +109,6 @@ def dens_func( z, r ) ...
            Whether to continuously inject the particles,
            in the case of a moving window
 
-        use_cuda : bool, optional
-            Wether to use the GPU or not.
-
         grid_shape: tuple, optional
             Needed when running on the GPU
             The shape of the local grid (including guard cells), i.e.
@@ -111,10 +117,14 @@ def dens_func( z, r ) ...
 
         particle_shape: str, optional
             Set the particle shape for the charge/current deposition.
-            Possible values are 'cubic', 'linear' and 'linear_non_atomic'.
-            While 'cubic' corresponds to third order shapes and 'linear'
-            to first order shapes, 'linear_non_atomic' uses an equivalent
-            deposition scheme to 'linear' which avoids atomics on the GPU.
+            Possible values are 'linear' and 'cubic' for first and third
+            order particle shape factors.
+
+        use_cuda : bool, optional
+            Whether to use the GPU or not.
+
+        use_threading : bool, optional
+            Whether to use multi-threading on the CPU.
         """
         # Register the timestep
         self.dt = dt
@@ -198,6 +208,9 @@ def dens_func( z, r ) ...
             if dens_func is not None :
                 self.w[:] = self.w * dens_func( self.z, r )
 
+        # Register particle shape
+        self.particle_shape = particle_shape
+
         # Allocate arrays and register variables when using CUDA
         if self.use_cuda:
             if grid_shape is None:
@@ -213,9 +226,11 @@ def dens_func( z, r ) ...
                                         dtype=np.int32 )
             # Register boolean that records if the particles are sorted or not
             self.sorted = False
-
-        # Register particle shape
-        self.particle_shape = particle_shape
+        # Register variables when using multithreading
+        self.use_threading = use_threading
+        if self.use_threading == True:
+            # Register number of threads
+            self.nthreads = numba.config.NUMBA_NUM_THREADS
 
     def send_particles_to_gpu( self ):
         """
@@ -423,6 +438,7 @@ def push_p( self ) :
         half-timestep *behind* the positions (x, y, z), and it brings
         them one half-timestep *ahead* of the positions.
         """
+        # GPU (CUDA) version
         if self.use_cuda:
             # Get the threads per block and the blocks per grid
             dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
@@ -441,7 +457,20 @@ def push_p( self ) :
                     self.Ex, self.Ey, self.Ez,
                     self.Bx, self.By, self.Bz,
                     self.m, self.Ntot, self.dt, self.ionizer.ionization_level )
-        else :
+        # CPU multi-threading version
+        elif self.use_threading:
+            if self.ionizer is None:
+                push_p_prange(self.ux, self.uy, self.uz, self.inv_gamma,
+                    self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
+                    self.q, self.m, self.Ntot, self.dt )
+            else:
+                # Ionizable species can have a charge that depends on the
+                # macroparticle, and hence require a different function
+                push_p_ioniz_prange(self.ux, self.uy, self.uz, self.inv_gamma,
+                    self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
+                    self.m, self.Ntot, self.dt, self.ionizer.ionization_level )
+        # CPU single-core version
+        else:
             if self.ionizer is None:
                 push_p_numba(self.ux, self.uy, self.uz, self.inv_gamma,
                     self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
@@ -461,6 +490,7 @@ def halfpush_x( self ) :
         one half-timestep *behind* the momenta (ux, uy, uz), or at the
         same timestep as the momenta.
         """
+        # GPU (CUDA) version
         if self.use_cuda:
             # Get the threads per block and the blocks per grid
             dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
@@ -471,7 +501,13 @@ def halfpush_x( self ) :
                 self.inv_gamma, self.dt )
             # The particle array is unsorted after the push in x
             self.sorted = False
-        else :
+        # CPU multi-threading version
+        elif self.use_threading:
+            push_x_prange( self.x, self.y, self.z,
+                self.ux, self.uy, self.uz,
+                self.inv_gamma, self.Ntot, self.dt ) 
+        # CPU single-core version
+        else:
             push_x_numba( self.x, self.y, self.z,
                 self.ux, self.uy, self.uz,
                 self.inv_gamma, self.Ntot, self.dt )
@@ -489,12 +525,24 @@ def gather( self, grid ) :
              (one InterpolationGrid object per azimuthal mode)
              Contains the field values on the interpolation grid
         """
-        if self.use_cuda == True:
+        # GPU (CUDA) version
+        if self.use_cuda:
             # Get the threads per block and the blocks per grid
             dim_grid_1d, dim_block_1d = cuda_tpb_bpg_1d( self.Ntot )
             # Call the CUDA Kernel for the gathering of E and B Fields
             # for Mode 0 and 1 only.
-            if self.particle_shape == 'cubic':
+            if self.particle_shape == 'linear':
+                gather_field_gpu_linear[dim_grid_1d, dim_block_1d](
+                     self.x, self.y, self.z,
+                     grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                     grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                     grid[0].Er, grid[0].Et, grid[0].Ez,
+                     grid[1].Er, grid[1].Et, grid[1].Ez,
+                     grid[0].Br, grid[0].Bt, grid[0].Bz,
+                     grid[1].Br, grid[1].Bt, grid[1].Bz,
+                     self.Ex, self.Ey, self.Ez,
+                     self.Bx, self.By, self.Bz)
+            elif self.particle_shape == 'cubic':
                 gather_field_gpu_cubic[dim_grid_1d, dim_block_1d](
                      self.x, self.y, self.z,
                      grid[0].invdz, grid[0].zmin, grid[0].Nz,
@@ -506,7 +554,24 @@ def gather( self, grid ) :
                      self.Ex, self.Ey, self.Ez,
                      self.Bx, self.By, self.Bz)
             else:
-                gather_field_gpu_linear[dim_grid_1d, dim_block_1d](
+                raise ValueError("`particle_shape` should be either \
+                                  'linear' or 'cubic' \
+                                   but is `%s`" % self.particle_shape)
+        # CPU multi-threading version
+        elif self.use_threading:
+            if self.particle_shape == 'linear':
+                gather_field_prange_linear(
+                     self.x, self.y, self.z,
+                     grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                     grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                     grid[0].Er, grid[0].Et, grid[0].Ez,
+                     grid[1].Er, grid[1].Et, grid[1].Ez,
+                     grid[0].Br, grid[0].Bt, grid[0].Bz,
+                     grid[1].Br, grid[1].Bt, grid[1].Bz,
+                     self.Ex, self.Ey, self.Ez,
+                     self.Bx, self.By, self.Bz)
+            elif self.particle_shape == 'cubic':
+                gather_field_prange_cubic(
                      self.x, self.y, self.z,
                      grid[0].invdz, grid[0].zmin, grid[0].Nz,
                      grid[0].invdr, grid[0].rmin, grid[0].Nr,
@@ -516,6 +581,11 @@ def gather( self, grid ) :
                      grid[1].Br, grid[1].Bt, grid[1].Bz,
                      self.Ex, self.Ey, self.Ez,
                      self.Bx, self.By, self.Bz)
+            else:
+                raise ValueError("`particle_shape` should be either \
+                                  'linear' or 'cubic' \
+                                   but is `%s`" % self.particle_shape)
+        # CPU single-core version
         else:
             # Preliminary arrays for the cylindrical conversion
             r = np.sqrt( self.x**2 + self.y**2 )
@@ -624,19 +694,13 @@ def deposit( self, fld, fieldtype ) :
         """
         # Shortcut for the list of InterpolationGrid objects
         grid = fld.interp
-
-        if self.use_cuda == True:
+        # GPU (CUDA) version
+        if self.use_cuda:
             # Get the threads per block and the blocks per grid
             dim_grid_2d_flat, dim_block_2d_flat = cuda_tpb_bpg_1d(
                                                     grid[0].Nz*grid[0].Nr )
             dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d(
                                           grid[0].Nz, grid[0].Nr )
-
-            # Create the helper arrays for deposition
-            if self.particle_shape == 'linear_non_atomic':
-                d_F0, d_F1, d_F2, d_F3 = cuda_deposition_arrays(
-                    grid[0].Nz, grid[0].Nr, fieldtype=fieldtype)
-
             # Sort the particles
             if self.sorted is False:
                 self.sort_particles(fld=fld)
@@ -648,17 +712,13 @@ def deposit( self, fld, fieldtype ) :
             # Rho
             if fieldtype == 'rho':
                 # Deposit rho in each of four directions
-                if self.particle_shape == 'linear_non_atomic':
-                    deposit_rho_gpu[dim_grid_2d_flat, dim_block_2d_flat](
+                if self.particle_shape == 'linear':
+                    deposit_rho_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
                         self.x, self.y, self.z, self.w,
                         grid[0].invdz, grid[0].zmin, grid[0].Nz,
                         grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        d_F0, d_F1, d_F2, d_F3,
-                        self.cell_idx, self.prefix_sum)
-                    # Add the four directions together
-                    add_rho[dim_grid_2d, dim_block_2d](
                         grid[0].rho, grid[1].rho,
-                        d_F0, d_F1, d_F2, d_F3)
+                        self.cell_idx, self.prefix_sum)
                 elif self.particle_shape == 'cubic':
                     deposit_rho_gpu_cubic[dim_grid_2d_flat, dim_block_2d_flat](
                         self.x, self.y, self.z, self.w,
@@ -666,33 +726,23 @@ def deposit( self, fld, fieldtype ) :
                         grid[0].invdr, grid[0].rmin, grid[0].Nr,
                         grid[0].rho, grid[1].rho,
                         self.cell_idx, self.prefix_sum)
-                elif self.particle_shape == 'linear':
-                    deposit_rho_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
-                        self.x, self.y, self.z, self.w,
-                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
-                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        grid[0].rho, grid[1].rho,
-                        self.cell_idx, self.prefix_sum)
                 else:
-                    raise ValueError("`particle_shape` should be either 'linear', 'linear_atomic' \
-                                      or 'cubic' but is `%s`" % self.particle_shape)
+                    raise ValueError("`particle_shape` should be either \
+                                      'linear' or 'cubic' \
+                                       but is `%s`" % self.particle_shape)
             # J
             elif fieldtype == 'J':
                 # Deposit J in each of four directions
-                if self.particle_shape == 'linear_non_atomic':
-                    deposit_J_gpu[dim_grid_2d_flat, dim_block_2d_flat](
+                if self.particle_shape == 'linear':
+                    deposit_J_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
                         self.x, self.y, self.z, self.w,
                         self.ux, self.uy, self.uz, self.inv_gamma,
                         grid[0].invdz, grid[0].zmin, grid[0].Nz,
                         grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        d_F0, d_F1, d_F2, d_F3,
-                        self.cell_idx, self.prefix_sum)
-                    # Add the four directions together
-                    add_J[dim_grid_2d, dim_block_2d](
                         grid[0].Jr, grid[1].Jr,
                         grid[0].Jt, grid[1].Jt,
                         grid[0].Jz, grid[1].Jz,
-                        d_F0, d_F1, d_F2, d_F3)
+                        self.cell_idx, self.prefix_sum)
                 elif self.particle_shape == 'cubic':
                     deposit_J_gpu_cubic[dim_grid_2d_flat, dim_block_2d_flat](
                         self.x, self.y, self.z, self.w,
@@ -703,26 +753,117 @@ def deposit( self, fld, fieldtype ) :
                         grid[0].Jt, grid[1].Jt,
                         grid[0].Jz, grid[1].Jz,
                         self.cell_idx, self.prefix_sum)
-                elif self.particle_shape == 'linear':
-                    deposit_J_gpu_linear[dim_grid_2d_flat, dim_block_2d_flat](
+                else:
+                    raise ValueError("`particle_shape` should be either \
+                                      'linear' or 'cubic' \
+                                       but is `%s`" % self.particle_shape)
+            else:
+                raise ValueError("`fieldtype` should be either 'J' or \
+                                  'rho', but is `%s`" % fieldtype)
+        # CPU multi-threading version
+        elif self.use_threading:
+            # Chunk size per thread (last thread takes the Ntot % nthreads rest)
+            tx_N = int(self.Ntot/self.nthreads)
+            tx_chunks = [ tx_N for k in range(self.nthreads) ]
+            tx_chunks[-1] = tx_chunks[-1] + int(self.Ntot % self.nthreads)
+            # Multithreading functions for the deposition of rho or J
+            # for Mode 0 and 1 only.
+            if fieldtype == 'rho':
+                # Generate temporary arrays for rho
+                rho_m0_global = np.zeros(
+                    (grid[0].rho.shape[0], grid[0].rho.shape[1], self.nthreads), 
+                    dtype=grid[0].rho.dtype )
+                rho_m1_global = np.zeros(
+                    (grid[1].rho.shape[0], grid[1].rho.shape[1], self.nthreads), 
+                    dtype=grid[1].rho.dtype )
+                # Deposit rho using CPU threading
+                if self.particle_shape == 'linear':
+                    deposit_rho_prange_linear(
+                        self.x, self.y, self.z, self.w,
+                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                        rho_m0_global, rho_m1_global,
+                        grid[0].rho, grid[1].rho,
+                        self.nthreads, tx_chunks, tx_N )
+                elif self.particle_shape == 'cubic':
+                    deposit_rho_prange_cubic(
+                        self.x, self.y, self.z, self.w,
+                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                        rho_m0_global, rho_m1_global,
+                        grid[0].rho, grid[1].rho,
+                        self.nthreads, tx_chunks, tx_N )
+                else:
+                    raise ValueError("`particle_shape` should be either \
+                                      'linear' or 'cubic' \
+                                       but is `%s`" % self.particle_shape)
+                # Sum thread-local results to main field array
+                grid[0].rho = np.sum(rho_m0_global, axis=2)
+                grid[1].rho = np.sum(rho_m1_global, axis=2)
+
+            elif fieldtype == 'J':
+                # Generate temporary arrays for J
+                Jr_m0_global = np.zeros(
+                    (grid[0].Jr.shape[0], grid[0].Jr.shape[1], self.nthreads), 
+                    dtype=grid[0].Jr.dtype )
+                Jt_m0_global = np.zeros(
+                    (grid[0].Jt.shape[0], grid[0].Jt.shape[1], self.nthreads), 
+                    dtype=grid[0].Jt.dtype )
+                Jz_m0_global = np.zeros(
+                    (grid[0].Jz.shape[0], grid[0].Jz.shape[1], self.nthreads), 
+                    dtype=grid[0].Jz.dtype )
+                Jr_m1_global = np.zeros(
+                    (grid[1].Jr.shape[0], grid[1].Jr.shape[1], self.nthreads), 
+                    dtype=grid[1].Jr.dtype )
+                Jt_m1_global = np.zeros(
+                    (grid[1].Jt.shape[0], grid[1].Jt.shape[1], self.nthreads), 
+                    dtype=grid[1].Jt.dtype )
+                Jz_m1_global = np.zeros(
+                    (grid[1].Jz.shape[0], grid[1].Jz.shape[1], self.nthreads), 
+                    dtype=grid[1].Jz.dtype )
+                # Deposit J using CPU threading
+                if self.particle_shape == 'linear':
+                    deposit_J_prange_linear(
                         self.x, self.y, self.z, self.w,
                         self.ux, self.uy, self.uz, self.inv_gamma,
                         grid[0].invdz, grid[0].zmin, grid[0].Nz,
                         grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                        Jr_m0_global, Jr_m1_global,
+                        Jt_m0_global, Jt_m1_global,
+                        Jz_m0_global, Jz_m1_global,
                         grid[0].Jr, grid[1].Jr,
                         grid[0].Jt, grid[1].Jt,
                         grid[0].Jz, grid[1].Jz,
-                        self.cell_idx, self.prefix_sum)
+                        self.nthreads, tx_chunks, tx_N )
+                elif self.particle_shape == 'cubic':
+                    deposit_J_prange_cubic(
+                        self.x, self.y, self.z, self.w,
+                        self.ux, self.uy, self.uz, self.inv_gamma,
+                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                        Jr_m0_global, Jr_m1_global,
+                        Jt_m0_global, Jt_m1_global,
+                        Jz_m0_global, Jz_m1_global,
+                        grid[0].Jr, grid[1].Jr,
+                        grid[0].Jt, grid[1].Jt,
+                        grid[0].Jz, grid[1].Jz,
+                        self.nthreads, tx_chunks, tx_N )
                 else:
                     raise ValueError("`particle_shape` should be either \
-                                      'linear', 'linear_atomic' or 'cubic' \
+                                      'linear' or 'cubic' \
                                        but is `%s`" % self.particle_shape)
+                # Sum thread-local results to main field array
+                grid[0].Jr = np.sum(Jr_m0_global, axis=2)
+                grid[0].Jt = np.sum(Jt_m0_global, axis=2)
+                grid[0].Jz = np.sum(Jz_m0_global, axis=2)
+                grid[1].Jr = np.sum(Jr_m1_global, axis=2)
+                grid[1].Jt = np.sum(Jt_m1_global, axis=2)
+                grid[1].Jz = np.sum(Jz_m1_global, axis=2)
+
             else:
                 raise ValueError("`fieldtype` should be either 'J' or \
                                   'rho', but is `%s`" % fieldtype)
-
-
-        # CPU version
+        # CPU single-core version
         else:
             # Preliminary arrays for the cylindrical conversion
             r = np.sqrt( self.x**2 + self.y**2 )
@@ -745,7 +886,7 @@ def deposit( self, fld, fieldtype ) :
             # number of elements in the grid list
             Nm = len(grid)
 
-            if fieldtype == 'rho' :
+            if fieldtype == 'rho':
                 # ---------------------------------------
                 # Deposit the charge density mode by mode
                 # ---------------------------------------
@@ -765,7 +906,7 @@ def deposit( self, fld, fieldtype ) :
                     deposit_field_numba(self.w*exptheta, grid[m].rho,
                                             iz, ir, Sz, Sr, -1.)
 
-            elif fieldtype == 'J' :
+            elif fieldtype == 'J':
                 # ----------------------------------------
                 # Deposit the current density mode by mode
                 # ----------------------------------------
@@ -793,9 +934,9 @@ def deposit( self, fld, fieldtype ) :
                     deposit_field_numba(Jz*exptheta, grid[m].Jz,
                                         iz, ir, Sz, Sr, -1.)
 
-            else :
-                raise ValueError(
-        "`fieldtype` should be either 'J' or 'rho', but is `%s`" %fieldtype )
+            else:
+                raise ValueError("`fieldtype` should be either 'J' or \
+                                  'rho', but is `%s`" % fieldtype)
 
     def sort_particles(self, fld):
         """
diff --git a/fbpic/particles/utility_methods.py b/fbpic/particles/utility_methods.py
deleted file mode 100644
index d0de6c5b8..000000000
--- a/fbpic/particles/utility_methods.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the optimized particles methods that use numba on a CPU
-"""
-import numpy as np
-
-# -----------------------
-# Particle shapes utility
-# -----------------------
-
-def weights(x, invdx, offset, Nx, direction, shape_order):
-    """
-    Return the array of cell indices and corresponding shape factors
-    for current/charge deposition and field gathering
-
-    Parameters:
-    -----------
-    x : 1darray of floats (in meters)
-        Array of particle positions along a given direction
-        (one element per macroparticle)
-
-    invdx : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    offset : float (in meters)
-        Position of the edge of the simulation box,
-        along the direction considered
-
-    Nx : int
-        Number of gridpoints along the considered direction
-
-    direction : string
-        Determines the boundary conditions. Either 'r' or 'z'
-
-    shape_order : int
-        Order of the shape factor.
-        Either 1 or 3
-
-    Returns:
-    --------
-    A tuple containing :
-
-    i: 2darray of ints
-        An array of shape (shape_order+1, Ntot)
-        where Ntot is the number of macroparticles
-        (i.e. the number of elements in the array x)
-        This array contains the indices of the grid cells
-        (along the axis specified by `direction`) where each macroparticle
-        deposits charge/current and gathers field data.
-
-    S: 2darray of floats
-        An array of shape (shape_order+1, Ntot)
-        where Ntot is the number of macroparticles
-        (i.e. the number of elements in the array x)
-        This array contains the shape factors (a.k.a. interpolation weights)
-        that correspond to each of the indices in the array `i`.
-    """
-    # Positions of the particles, in the cell unit
-    x_cell = invdx*(x - offset) - 0.5
-
-    # Initialize empty arrays of the correct size
-    i = np.empty( (shape_order+1, len(x)), dtype=np.int64)
-    S = np.empty( (shape_order+1, len(x)), dtype=np.float64)
-
-    # Indices and shapes
-    if shape_order == 1:
-        i[0,:] = np.floor(x_cell).astype('int')
-        i[1,:] = i[0,:] + 1
-        # Linear weight
-        S[0,:] = i[1,:] - x_cell
-        S[1,:] = x_cell - i[0,:]
-    elif shape_order == 3:
-        i[0,:] = np.floor(x_cell).astype('int') - 1
-        i[1,:] = i[0,:] + 1
-        i[2,:] = i[0,:] + 2
-        i[3,:] = i[0,:] + 3
-        # Cubic Weights
-        S[0,:] = -1./6. * ((x_cell-i[0])-2)**3
-        S[1,:] = 1./6. * (3*((x_cell-i[1])**3) - 6*((x_cell-i[1])**2)+4)
-        S[2,:] = 1./6. * (3*((i[2]-x_cell)**3) - 6*((i[2]-x_cell)**2)+4)
-        S[3,:] = -1./6. * ((i[3]-x_cell)-2)**3
-    else:
-        raise ValueError("shapes other than linear and cubic are not supported yet.")
-
-    # Periodic boundary conditions in z
-    if direction == 'z':
-        # Lower Bound Periodic
-        i = np.where( i < 0, i+Nx, i )
-        # Upper Bound Periodic
-        i = np.where( i > Nx-1, i-Nx, i )
-    # Absorbing boundary condition at the upper r boundary
-    elif direction == 'r':
-        i = np.where(  i > Nx-1, Nx-1, i )
-        # Note: The lower bound index shift for r is done in the gather
-        # and deposit methods because the sign changes.
-        # This avoids using specific guard cells.
-    else:
-        raise ValueError("Unrecognized `direction` : %s" % direction)
-
-    # Return the result
-    return( i, S )
-
-# ----------------------------
-# Angle initialization utility
-# ----------------------------
-
-def unalign_angles( thetap, Npz, Npr, method='irrational' ) :
-    """
-    Shift the angles so that the particles are
-    not all aligned along the arms of a star transversely
-
-    The fact that the particles are all aligned can produce
-    numerical artefacts, especially if the polarization of the laser
-    is aligned with this direction.
-
-    Here, for each position in r and z, we add the *same*
-    shift for all the Nptheta particles that are at this position.
-    (This preserves the fact that certain modes are 0 initially.)
-    How this shift varies from one position to another depends on
-    the method chosen.
-
-    Parameters
-    ----------
-    thetap : 3darray of floats
-        An array of shape (Npr, Npz, Nptheta) containing the angular
-        positions of the particles, and which is modified by this function.
-
-    Npz, Npr : ints
-        The number of macroparticles along the z and r directions
-
-    method : string
-        Either 'random' or 'irrational'
-    """
-    # Determine the angle shift
-    if method == 'random' :
-        angle_shift = 2*np.pi*np.random.rand(Npz, Npr)
-    elif method == 'irrational' :
-        # Subrandom sequence, by adding irrational number (sqrt(2) and sqrt(3))
-        # This ensures that the sequence does not wrap around and induce
-        # correlations
-        shiftr = np.sqrt(2)*np.arange(Npr)
-        shiftz = np.sqrt(3)*np.arange(Npz)
-        angle_shift = 2*np.pi*( shiftz[:,np.newaxis] + shiftr[np.newaxis,:] )
-        angle_shift = np.mod( angle_shift, 2*np.pi )
-    else :
-        raise ValueError(
-      "method must be either 'random' or 'irrational' but is %s" %method )
-
-    # Add the angle shift to thetap
-    # np.newaxis ensures that the angles that are at the same positions
-    # in r and z have the same shift
-    thetap[:,:,:] = thetap[:,:,:] + angle_shift[:,:, np.newaxis]

From 497c55a8ef8d3a1ce076974dea60c2757aaa07f1 Mon Sep 17 00:00:00 2001
From: Manuel Kirchen <manuel.kirchen@desy.de>
Date: Fri, 14 Jul 2017 18:53:15 +0200
Subject: [PATCH 02/36] Initial CPU multi-threading implementation (part 2)

---
 fbpic/particles/deposition/__init__.py        |    0
 fbpic/particles/deposition/cuda_methods.py    | 1683 +++++++++++++++++
 fbpic/particles/deposition/numba_methods.py   |   66 +
 .../particles/deposition/threading_methods.py |  454 +++++
 fbpic/particles/gathering/__init__.py         |    0
 fbpic/particles/gathering/cuda_methods.py     |  599 ++++++
 fbpic/particles/gathering/numba_methods.py    |   82 +
 .../particles/gathering/threading_methods.py  |  580 ++++++
 fbpic/particles/push/__init__.py              |    0
 fbpic/particles/push/cuda_methods.py          |  191 ++
 fbpic/particles/push/numba_methods.py         |  110 ++
 fbpic/particles/push/threading_methods.py     |  114 ++
 fbpic/particles/utilities/__init__.py         |    0
 fbpic/particles/utilities/cuda_sorting.py     |  182 ++
 fbpic/particles/utilities/utility_methods.py  |  155 ++
 15 files changed, 4216 insertions(+)
 create mode 100644 fbpic/particles/deposition/__init__.py
 create mode 100644 fbpic/particles/deposition/cuda_methods.py
 create mode 100644 fbpic/particles/deposition/numba_methods.py
 create mode 100644 fbpic/particles/deposition/threading_methods.py
 create mode 100644 fbpic/particles/gathering/__init__.py
 create mode 100644 fbpic/particles/gathering/cuda_methods.py
 create mode 100644 fbpic/particles/gathering/numba_methods.py
 create mode 100644 fbpic/particles/gathering/threading_methods.py
 create mode 100644 fbpic/particles/push/__init__.py
 create mode 100644 fbpic/particles/push/cuda_methods.py
 create mode 100644 fbpic/particles/push/numba_methods.py
 create mode 100644 fbpic/particles/push/threading_methods.py
 create mode 100644 fbpic/particles/utilities/__init__.py
 create mode 100644 fbpic/particles/utilities/cuda_sorting.py
 create mode 100644 fbpic/particles/utilities/utility_methods.py

diff --git a/fbpic/particles/deposition/__init__.py b/fbpic/particles/deposition/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fbpic/particles/deposition/cuda_methods.py b/fbpic/particles/deposition/cuda_methods.py
new file mode 100644
index 000000000..2686479bf
--- /dev/null
+++ b/fbpic/particles/deposition/cuda_methods.py
@@ -0,0 +1,1683 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the deposition methods for rho and J for linear and cubic
+order shapes on the GPU using CUDA.
+"""
+from numba import cuda, int64
+import math
+from scipy.constants import c
+import numpy as np
+
+# -------------------------------
+# Particle shape factor functions
+# -------------------------------
+
+# Linear shapes
+@cuda.jit(device=True, inline=True)
+def z_shape_linear(cell_position, index):  # linear (2-node) weight along z; index 0 = lower node, 1 = upper node
+    iz = int64(math.floor(cell_position))  # index of the node just below the position (in cell units)
+    if index == 0:
+        return iz+1.-cell_position  # weight of the lower node
+    if index == 1:
+        return cell_position - iz  # weight of the upper node; the two weights sum to 1
+
+@cuda.jit(device=True, inline=True)
+def r_shape_linear(cell_position, index):  # linear (2-node) weight along r; index 0 = lower node, 1 = upper node
+    flip_factor = 1.  # sign applied to the returned weight
+    ir = int64(math.floor(cell_position))  # index of the node just below the position (in cell units)
+    if index == 0:
+        if ir < 0:  # lower node has a negative index (NOTE(review): presumably across the r=0 axis - confirm)
+            flip_factor = -1.  # its weight is returned with the opposite sign
+        return flip_factor*(ir+1.-cell_position)
+    if index == 1:
+        return flip_factor*(cell_position - ir)  # flip_factor is always 1. on this path
+
+# Cubic shapes
+@cuda.jit(device=True, inline=True)
+def z_shape_cubic(cell_position, index):  # cubic (4-node) weight along z; index 0..3 selects the node
+    iz = int64(math.floor(cell_position)) - 1  # lowest of the four nodes (one below the floor of the position)
+    if index == 0:
+        return (-1./6.)*((cell_position-iz)-2)**3  # weight of node iz
+    if index == 1:
+        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)  # weight of node iz+1
+    if index == 2:
+        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)  # weight of node iz+2
+    if index == 3:
+        return (-1./6.)*(((iz+3)-cell_position)-2)**3  # weight of node iz+3
+
+@cuda.jit(device=True, inline=True)
+def r_shape_cubic(cell_position, index):  # cubic (4-node) weight along r; index 0..3 selects the node
+    flip_factor = 1.  # sign applied to the returned weight
+    ir = int64(math.floor(cell_position)) - 1  # lowest of the four nodes (one below the floor of the position)
+    if index == 0:
+        if ir < 0:  # node ir has a negative index: return its weight with the opposite sign
+            flip_factor = -1.
+        return flip_factor*(-1./6.)*((cell_position-ir)-2)**3  # weight of node ir
+    if index == 1:
+        if ir+1 < 0:  # same sign flip, tested on node ir+1
+            flip_factor = -1.
+        return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4)  # weight of node ir+1
+    if index == 2:
+        if ir+2 < 0:  # same sign flip, tested on node ir+2
+            flip_factor = -1.
+        return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4)  # weight of node ir+2
+    if index == 3:
+        if ir+3 < 0:  # same sign flip, tested on node ir+3
+            flip_factor = -1.
+        return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3  # weight of node ir+3
+
+# -------------------------------
+# Field deposition - linear - rho
+# -------------------------------
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+                float64, float64, int32, \
+                float64, float64, int32, \
+                complex128[:,:], complex128[:,:], \
+                int32[:], int32[:])')
+def deposit_rho_gpu_linear(x, y, z, w,
+                           invdz, zmin, Nz,
+                           invdr, rmin, Nr,
+                           rho_m0, rho_m1,
+                           cell_idx, prefix_sum):
+    """
+    Deposition of the charge density rho using numba on the GPU.
+    Iterates over the cells and over the particles per cell.
+    Calculates the weighted amount of rho that is deposited to the
+    4 cells surrounding the particle based on its shape (linear).
+
+    The particles are sorted by their cell index (the lower cell
+    in r and z that they deposit to) and the deposited field
+    is split into 4 variables (one for each possible direction,
+    e.g. upper in z, lower in r) to maintain parallelism while
+    avoiding any race conditions.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    rho_m0, rho_m1 : 2darrays of complex numbers
+        The charge density on the interpolation grid for
+        mode 0 and 1. (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the considered direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    cell_idx : 1darray of integers
+        The cell index of the particle (not read by this kernel)
+
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    # Get the 1D CUDA grid
+    i = cuda.grid(1)
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    if i < prefix_sum.shape[0]:
+        # Calculate the cell index in 2D from the 1D threadIdx
+        iz_cell = int(i / Nr)
+        ir_cell = int(i - iz_cell * Nr)
+        # Calculate the inclusive offset for the current cell
+        # It represents the number of particles contained in all other cells
+        # with an index smaller than i + the total number of particles in the
+        # current cell (inclusive).
+        incl_offset = np.int32(prefix_sum[i])
+        # Calculate the frequency per cell from the offset and the previous
+        # offset (prefix_sum[i-1]).
+        if i > 0:
+            frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1])
+        if i == 0:
+            frequency_per_cell = np.int32(incl_offset)
+
+        # Declare the local accumulators (one scalar per target node and mode)
+        R_m0_00 = 0.
+        R_m0_01 = 0.
+        R_m0_10 = 0.
+        R_m0_11 = 0.
+
+        R_m1_00 = 0. + 0.j
+        R_m1_01 = 0. + 0.j
+        R_m1_10 = 0. + 0.j
+        R_m1_11 = 0. + 0.j
+
+        for j in range(frequency_per_cell):
+            # Get the particle index before the sorting
+            # --------------------------------------------
+            # (Since incl_offset is a cumulative sum of particle number,
+            # and since python index starts at 0, one has to add -1)
+            ptcl_idx = incl_offset-1-j
+
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate rho
+            # --------------------------------------------
+            # Mode 0
+            R_m0_scal = wj * exptheta_m0
+            # Mode 1
+            R_m1_scal = wj * exptheta_m1
+
+            # Index of the lower radial node (used for the ir_lower == -1 case below)
+            ir_lower = int64(math.floor(r_cell))
+
+            R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal
+            R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal
+            R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal
+            R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal
+
+            if ir_lower == -1:  # both radial weights accumulate into the *_00 / *_01 variables
+                R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
+                R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
+                R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
+                R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
+            else:
+                R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
+                R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
+                R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
+                R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
+
+        # Cell shifts for the simulation boundaries
+        shift_r = 0
+        shift_z = 0
+        if ir_cell+1 > (Nr-1):
+            shift_r = -1  # upper r index ir_cell+1 is pulled back to ir_cell
+        if iz_cell+1 > Nz-1:
+            shift_z -= Nz  # upper z index iz_cell+1 is wrapped around by -Nz
+
+        # Atomically add the local accumulators to the grid arrays (skipped for empty cells)
+        if frequency_per_cell > 0:
+
+            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_00.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_00.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_00.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell), R_m0_01.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell), R_m1_01.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell), R_m1_01.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell+1 + shift_r), R_m0_10.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell+1 + shift_r), R_m1_10.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell+1 + shift_r), R_m1_10.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m0_11.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), R_m1_11.imag)
+
+
+# -------------------------------
+# Field deposition - linear - J
+# -------------------------------
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+                float64[:], float64[:], float64[:], float64[:], \
+                float64, float64, int32, \
+                float64, float64, int32, \
+                complex128[:,:], complex128[:,:], \
+                complex128[:,:], complex128[:,:], \
+                complex128[:,:], complex128[:,:],\
+                int32[:], int32[:])')
+def deposit_J_gpu_linear(x, y, z, w,
+                         ux, uy, uz, inv_gamma,
+                         invdz, zmin, Nz,
+                         invdr, rmin, Nr,
+                         j_r_m0, j_r_m1,
+                         j_t_m0, j_t_m1,
+                         j_z_m0, j_z_m1,
+                         cell_idx, prefix_sum):
+    """
+    Deposition of the current J using numba on the GPU.
+    Iterates over the cells and over the particles per cell.
+    Calculates the weighted amount of J that is deposited to the
+    4 cells surrounding the particle based on its shape (linear).
+
+    The particles are sorted by their cell index (the lower cell
+    in r and z that they deposit to) and the deposited field
+    is split into 4 variables (one for each possible direction,
+    e.g. upper in z, lower in r) to maintain parallelism while
+    avoiding any race conditions.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (NOTE(review): likely dimensionless momenta,
+        since the code multiplies them by c*inv_gamma to form a velocity - confirm)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1 : 2darrays of complex numbers
+        The current component in each direction (r, t, z)
+        on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    cell_idx : 1darray of integers
+        The cell index of the particle (not read by this kernel)
+
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    # Get the 1D CUDA grid
+    i = cuda.grid(1)
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    if i < prefix_sum.shape[0]:
+        # Calculate the cell index in 2D from the 1D threadIdx
+        iz_cell = int(i/Nr)
+        ir_cell = int(i - iz_cell * Nr)
+        # Calculate the inclusive offset for the current cell
+        # It represents the number of particles contained in all other cells
+        # with an index smaller than i + the total number of particles in the
+        # current cell (inclusive).
+        incl_offset = np.int32(prefix_sum[i])
+        # Calculate the frequency per cell from the offset and the previous
+        # offset (prefix_sum[i-1]).
+        if i > 0:
+            frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1])
+        if i == 0:
+            frequency_per_cell = np.int32(incl_offset)
+
+        # Declare the local field value for
+        # all possible deposition directions,
+        # depending on the shape order and per mode for r,t and z.
+
+        J_r_m0_00 = 0.
+        J_r_m1_00 = 0. + 0.j
+        J_t_m0_00 = 0.  # mode-0 accumulators start as real floats, mode-1 as complex
+        J_t_m1_00 = 0. + 0.j
+        J_z_m0_00 = 0.
+        J_z_m1_00 = 0. + 0.j
+
+        J_r_m0_01 = 0.
+        J_r_m1_01 = 0. + 0.j
+        J_t_m0_01 = 0.
+        J_t_m1_01 = 0. + 0.j
+        J_z_m0_01 = 0.
+        J_z_m1_01 = 0. + 0.j
+
+        J_r_m0_10 = 0.
+        J_r_m1_10 = 0. + 0.j
+        J_t_m0_10 = 0.
+        J_t_m1_10 = 0. + 0.j
+        J_z_m0_10 = 0.
+        J_z_m1_10 = 0. + 0.j
+
+        J_r_m0_11 = 0.
+        J_r_m1_11 = 0. + 0.j
+        J_t_m0_11 = 0.
+        J_t_m1_11 = 0. + 0.j
+        J_z_m0_11 = 0.
+        J_z_m1_11 = 0. + 0.j
+
+
+        # Loop over the number of particles per cell
+        for j in range(frequency_per_cell):
+            # Get the particle index
+            # ----------------------
+            # (Since incl_offset is a cumulative sum of particle number,
+            # and since python index starts at 0, one has to add -1)
+            ptcl_idx = incl_offset-1-j
+
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Velocity
+            uxj = ux[ptcl_idx]
+            uyj = uy[ptcl_idx]
+            uzj = uz[ptcl_idx]
+            # Inverse gamma
+            inv_gammaj = inv_gamma[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Get weights for the deposition
+            # --------------------------------------------
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate the currents
+            # --------------------------------------------
+            # Mode 0
+            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
+            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
+            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
+            # Mode 1
+            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
+            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
+            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
+
+            # Deposit on local copies at respective position
+            ir_lower = int64(math.floor(r_cell))
+
+            J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+            J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+            J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+            J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+            J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+            J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+            J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+            J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+            J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+            J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+            J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+            J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+
+            # Take into account lower r flips
+            if ir_lower == -1:  # both radial weights accumulate into the *_00 / *_01 variables
+                J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+                J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+                J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+                J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+                J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+                J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+                J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+                J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+                J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+                J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+                J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+                J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+            else:
+                J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+                J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+                J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+                J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+                J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+                J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+                J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+                J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+                J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+                J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+                J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+                J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+
+        # Cell shifts for the simulation boundaries
+        shift_r = 0
+        shift_z = 0
+        if (ir_cell+1) > (Nr-1):
+            shift_r = -1  # upper r index ir_cell+1 is pulled back to ir_cell
+        if (iz_cell+1) > Nz-1:
+            shift_z -= Nz  # upper z index iz_cell+1 is wrapped around by -Nz
+
+        # Atomically add the local accumulators to the grid arrays (skipped for empty cells)
+        if frequency_per_cell > 0:
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_00.real)
+            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_00.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_00.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell), J_r_m0_01.real)
+            cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_r_m1_01.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell+1 + shift_r), J_r_m0_10.real)
+            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_r_m1_10.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m0_11.real)
+            cuda.atomic.add(j_r_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_r_m1_11.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_00.real)
+            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_00.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_00.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell), J_t_m0_01.real)
+            cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_t_m1_01.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell+1 + shift_r), J_t_m0_10.real)
+            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_t_m1_10.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m0_11.real)
+            cuda.atomic.add(j_t_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_t_m1_11.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_00.real)
+            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_00.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_00.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell), J_z_m0_01.real)
+            cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell), J_z_m1_01.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell+1 + shift_r), J_z_m0_10.real)
+            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell+1 + shift_r), J_z_m1_10.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m0_11.real)
+            cuda.atomic.add(j_z_m1.real, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell+1 + shift_z, ir_cell+1 + shift_r), J_z_m1_11.imag)
+
+# -------------------------------
+# Field deposition - cubic - rho
+# -------------------------------
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+                float64, float64, int32, \
+                float64, float64, int32, \
+                complex128[:,:], complex128[:,:], \
+                int32[:], int32[:])')
+def deposit_rho_gpu_cubic(x, y, z, w,
+                          invdz, zmin, Nz,
+                          invdr, rmin, Nr,
+                          rho_m0, rho_m1,
+                          cell_idx, prefix_sum):
+    """
+    Deposition of the charge density rho using numba on the GPU.
+    One thread per cell: loops over the particles of that cell and
+    accumulates the weighted amount of rho that is deposited to the
+    16 cells surrounding each particle based on its shape (cubic).
+
+    The particles are expected to be sorted by cell index, so that
+    `prefix_sum` delimits the particles of each cell. The deposited
+    field is split into 16 local variables per mode (one for each
+    surrounding cell), which are atomically added to the grid at the end.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1darray of floats
+        The weights of the particles
+
+    rho_m0, rho_m1 : 2darrays of complex numbers
+        The charge density on the interpolation grid for
+        mode 0 and 1, indexed as [iz, ir] (modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the considered direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    cell_idx : 1darray of integers
+        The cell index of the particle (not read by this kernel)
+
+    prefix_sum : 1darray of integers
+        Inclusive cumulative sum of the number of
+        particles per cell (one entry per cell)
+    """
+    # Get the index of this thread in the 1D CUDA grid
+    i = cuda.grid(1)
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    if i < prefix_sum.shape[0]:
+        # Calculate the 2D cell index (iz, ir) from the 1D thread index
+        iz_cell = int(i / Nr)
+        ir_cell = int(i - iz_cell * Nr)
+        # Calculate the inclusive offset for the current cell
+        # It represents the number of particles contained in all other cells
+        # with an index smaller than i + the total number of particles in the
+        # current cell (inclusive).
+        incl_offset = np.int32(prefix_sum[i])
+        # Calculate the frequency per cell from the offset and the previous
+        # offset (prefix_sum[i-1]).
+        if i > 0:
+            frequency_per_cell = np.int32(incl_offset - prefix_sum[i - 1])
+        if i == 0:
+            frequency_per_cell = np.int32(incl_offset)
+
+        # Declare the local accumulators R_m<mode>_<r slot><z slot>
+        R_m0_00 = 0.
+        R_m1_00 = 0. + 0.j
+
+        R_m0_01 = 0.
+        R_m1_01 = 0. + 0.j
+
+        R_m0_02 = 0.
+        R_m1_02 = 0. + 0.j
+
+        R_m0_03 = 0.
+        R_m1_03 = 0. + 0.j
+
+        R_m0_10 = 0.
+        R_m1_10 = 0. + 0.j
+
+        R_m0_11 = 0.
+        R_m1_11 = 0. + 0.j
+
+        R_m0_12 = 0.
+        R_m1_12 = 0. + 0.j
+
+        R_m0_13 = 0.
+        R_m1_13 = 0. + 0.j
+
+        R_m0_20 = 0.
+        R_m1_20 = 0. + 0.j
+
+        R_m0_21 = 0.
+        R_m1_21 = 0. + 0.j
+
+        R_m0_22 = 0.
+        R_m1_22 = 0. + 0.j
+
+        R_m0_23 = 0.
+        R_m1_23 = 0. + 0.j
+
+        R_m0_30 = 0.
+        R_m1_30 = 0. + 0.j
+
+        R_m0_31 = 0.
+        R_m1_31 = 0. + 0.j
+
+        R_m0_32 = 0.
+        R_m1_32 = 0. + 0.j
+
+        R_m0_33 = 0.
+        R_m1_33 = 0. + 0.j
+        # Loop over the particles of the current cell
+        for j in range(frequency_per_cell):
+            # Get the particle index before the sorting
+            # --------------------------------------------
+            # (Since incl_offset is a cumulative sum of particle number,
+            # and since python index starts at 0, one has to add -1)
+            ptcl_idx = incl_offset-1-j
+
+            # Preliminary values for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Position of the particle, in cell units (shifted by half a cell)
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate rho
+            # --------------------------------------------
+            # Mode 0
+            R_m0_scal = wj * exptheta_m0
+            # Mode 1
+            R_m1_scal = wj * exptheta_m1
+            # Compute values in local copies and consider the lower r boundary
+            ir0 = int64(math.floor(r_cell)) - 1
+
+            if (ir0 == -2):  # r shape 0,3 -> slot 2; r shape 1,2 -> slot 1
+                R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+            if (ir0 == -1):  # r shape 0,1 -> slot 1; 2 -> slot 2; 3 -> slot 3
+                R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+            if (ir0 >= 0):  # r shape k -> slot k
+                R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+        # Index Shifting since local copies are centered around
+        # the current cell
+        srl = 0         # shift r lower
+        sru = 0         # shift r upper inner
+        sru2 = 0        # shift r upper outer
+        szl = 0         # shift z lower
+        szu = 0         # shift z upper inner
+        szu2 = 0        # shift z upper outer
+        if (iz_cell-1) < 0:  # wrap around in z
+            szl += Nz
+        if (iz_cell) == (Nz - 1):  # wrap around in z
+            szu -= Nz
+            szu2 -= Nz
+        if (iz_cell+1) == (Nz - 1):  # wrap around in z
+            szu2 -= Nz
+        if (ir_cell) >= (Nr - 1):  # fold both upper slots back onto ir_cell
+            sru = -1
+            sru2 = -2
+        if (ir_cell+1) == (Nr - 1):  # fold the outer upper slot onto Nr - 1
+            sru2 = -1
+        if (ir_cell-1) < 0:  # fold the lower slot back onto ir_cell
+            srl = 1
+
+        # Atomically add the local copies to the grid (skipped for empty cells)
+        if frequency_per_cell > 0:
+
+            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m0_00.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), R_m1_00.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell - 1 + srl), R_m0_01.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell - 1 + srl), R_m1_01.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell - 1 + srl), R_m1_01.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m0_02.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), R_m1_02.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m0_03.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), R_m1_03.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell ), R_m0_10.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell ), R_m1_10.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell ), R_m1_10.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell), R_m0_11.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell), R_m1_11.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell), R_m1_11.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell), R_m0_12.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell), R_m1_12.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell), R_m1_12.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell), R_m0_13.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell), R_m1_13.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell), R_m1_13.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m0_20.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), R_m1_20.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 1 + sru), R_m0_21.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 1 + sru), R_m1_21.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 1 + sru), R_m1_21.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m0_22.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), R_m1_22.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m0_23.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), R_m1_23.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m0_30.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), R_m1_30.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell, ir_cell + 2 + sru2), R_m0_31.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell, ir_cell + 2 + sru2), R_m1_31.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell, ir_cell + 2 + sru2), R_m1_31.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m0_32.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), R_m1_32.imag)
+
+            cuda.atomic.add(rho_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m0_33.real)
+            cuda.atomic.add(rho_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.real)
+            cuda.atomic.add(rho_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), R_m1_33.imag)
+
+
+# -------------------------------
+# Field deposition - cubic - J
+# -------------------------------
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+                float64[:], float64[:], float64[:], float64[:], \
+                float64, float64, int32, \
+                float64, float64, int32, \
+                complex128[:,:], complex128[:,:], \
+                complex128[:,:], complex128[:,:], \
+                complex128[:,:], complex128[:,:],\
+                int32[:], int32[:])')
+def deposit_J_gpu_cubic(x, y, z, w,
+                        ux, uy, uz, inv_gamma,
+                        invdz, zmin, Nz,
+                        invdr, rmin, Nr,
+                        j_r_m0, j_r_m1,
+                        j_t_m0, j_t_m1,
+                        j_z_m0, j_z_m1,
+                        cell_idx, prefix_sum):
+    """
+    Deposition of the current J using numba on the GPU.
+    Iterates over the cells and over the particles per cell.
+    Calculates the weighted amount of J that is deposited to the
+    16 cells surounding the particle based on its shape (cubic).
+
+    The particles are sorted by their cell index (the lower cell
+    in r and z that they deposit to) and the deposited field
+    is split into 16 variables (one for each cell) to maintain
+    parallelism while avoiding any race conditions.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (in meters * second^-1)
+        The velocity of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs
+        The current component in each direction (r, t, z)
+        on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    cell_idx : 1darray of integers
+        The cell index of the particle
+
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    # Get the 1D CUDA grid
+    i = cuda.grid(1)
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    if i < prefix_sum.shape[0]:
+        # Calculate the cell index in 2D from the 1D threadIdx
+        iz_cell = int(i/Nr)
+        ir_cell = int(i - iz_cell * Nr)
+        # Calculate the inclusive offset for the current cell
+        # It represents the number of particles contained in all other cells
+        # with an index smaller than i + the total number of particles in the
+        # current cell (inclusive).
+        incl_offset = np.int32(prefix_sum[i])
+        # Calculate the frequency per cell from the offset and the previous
+        # offset (prefix_sum[i-1]).
+        if i > 0:
+            frequency_per_cell = np.int32(incl_offset - prefix_sum[i-1])
+        if i == 0:
+            frequency_per_cell = np.int32(incl_offset)
+
+        # Declare the local field value for
+        # all possible deposition directions,
+        # depending on the shape order and per mode for r,t and z.
+        J_r_m0_00 = 0.
+        J_t_m0_00 = 0.
+        J_z_m0_00 = 0.
+        J_r_m1_00 = 0. + 0.j
+        J_t_m1_00 = 0. + 0.j
+        J_z_m1_00 = 0. + 0.j
+
+        J_r_m0_01 = 0.
+        J_t_m0_01 = 0.
+        J_z_m0_01 = 0.
+        J_r_m1_01 = 0. + 0.j
+        J_t_m1_01 = 0. + 0.j
+        J_z_m1_01 = 0. + 0.j
+
+        J_r_m0_02 = 0.
+        J_t_m0_02 = 0.
+        J_z_m0_02 = 0.
+        J_r_m1_02 = 0. + 0.j
+        J_t_m1_02 = 0. + 0.j
+        J_z_m1_02 = 0. + 0.j
+
+        J_r_m0_03 = 0.
+        J_t_m0_03 = 0.
+        J_z_m0_03 = 0.
+        J_r_m1_03 = 0. + 0.j
+        J_t_m1_03 = 0. + 0.j
+        J_z_m1_03 = 0. + 0.j
+
+        J_r_m0_10 = 0.
+        J_t_m0_10 = 0.
+        J_z_m0_10 = 0.
+        J_r_m1_10 = 0. + 0.j
+        J_t_m1_10 = 0. + 0.j
+        J_z_m1_10 = 0. + 0.j
+
+        J_r_m0_11 = 0.
+        J_t_m0_11 = 0.
+        J_z_m0_11 = 0.
+        J_r_m1_11 = 0. + 0.j
+        J_t_m1_11 = 0. + 0.j
+        J_z_m1_11 = 0. + 0.j
+
+        J_r_m0_12 = 0.
+        J_t_m0_12 = 0.
+        J_z_m0_12 = 0.
+        J_r_m1_12 = 0. + 0.j
+        J_t_m1_12 = 0. + 0.j
+        J_z_m1_12 = 0. + 0.j
+
+        J_r_m0_13 = 0.
+        J_t_m0_13 = 0.
+        J_z_m0_13 = 0.
+        J_r_m1_13 = 0. + 0.j
+        J_t_m1_13 = 0. + 0.j
+        J_z_m1_13 = 0. + 0.j
+
+        J_r_m0_20 = 0.
+        J_t_m0_20 = 0.
+        J_z_m0_20 = 0.
+        J_r_m1_20 = 0. + 0.j
+        J_t_m1_20 = 0. + 0.j
+        J_z_m1_20 = 0. + 0.j
+
+        J_r_m0_21 = 0.
+        J_t_m0_21 = 0.
+        J_z_m0_21 = 0.
+        J_r_m1_21 = 0. + 0.j
+        J_t_m1_21 = 0. + 0.j
+        J_z_m1_21 = 0. + 0.j
+
+        J_r_m0_22 = 0.
+        J_t_m0_22 = 0.
+        J_z_m0_22 = 0.
+        J_r_m1_22 = 0. + 0.j
+        J_t_m1_22 = 0. + 0.j
+        J_z_m1_22 = 0. + 0.j
+
+        J_r_m0_23 = 0.
+        J_t_m0_23 = 0.
+        J_z_m0_23 = 0.
+        J_r_m1_23 = 0. + 0.j
+        J_t_m1_23 = 0. + 0.j
+        J_z_m1_23 = 0. + 0.j
+
+        J_r_m0_30 = 0.
+        J_t_m0_30 = 0.
+        J_z_m0_30 = 0.
+        J_r_m1_30 = 0. + 0.j
+        J_t_m1_30 = 0. + 0.j
+        J_z_m1_30 = 0. + 0.j
+
+        J_r_m0_31 = 0.
+        J_t_m0_31 = 0.
+        J_z_m0_31 = 0.
+        J_r_m1_31 = 0. + 0.j
+        J_t_m1_31 = 0. + 0.j
+        J_z_m1_31 = 0. + 0.j
+
+        J_r_m0_32 = 0.
+        J_t_m0_32 = 0.
+        J_z_m0_32 = 0.
+        J_r_m1_32 = 0. + 0.j
+        J_t_m1_32 = 0. + 0.j
+        J_z_m1_32 = 0. + 0.j
+
+        J_r_m0_33 = 0.
+        J_t_m0_33 = 0.
+        J_z_m0_33 = 0.
+        J_r_m1_33 = 0. + 0.j
+        J_t_m1_33 = 0. + 0.j
+        J_z_m1_33 = 0. + 0.j
+
+        # Loop over the number of particles per cell
+        for j in range(frequency_per_cell):
+            # Get the particle index
+            # ----------------------
+            # (Since incl_offset is a cumulative sum of particle number,
+            # and since python index starts at 0, one has to add -1)
+            ptcl_idx = incl_offset-1-j
+
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Velocity
+            uxj = ux[ptcl_idx]
+            uyj = uy[ptcl_idx]
+            uzj = uz[ptcl_idx]
+            # Inverse gamma
+            inv_gammaj = inv_gamma[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Get weights for the deposition
+            # --------------------------------------------
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate the currents
+            # --------------------------------------------
+            # Mode 0
+            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
+            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
+            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
+            # Mode 1
+            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
+            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
+            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
+
+            # Compute values in local copies and consider boundaries
+            ir0 = int64(math.floor(r_cell)) - 1
+
+            if (ir0 == -2):
+                J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+                J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+
+                J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+            if (ir0 == -1):
+                J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+            if (ir0 >= 0):
+                J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
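+        # Index shifting: the local accumulators span offsets -1..+2 around
+        # the current cell in both z and r, so offsets that fall outside the
+        # grid are wrapped around in z and clamped to the nearest valid cell in r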
+        srl = 0         # shift r lower
+        sru = 0         # shift r upper inner
+        sru2 = 0        # shift r upper outer
+        szl = 0         # shift z lower
+        szu = 0         # shift z upper inner
+        szu2 = 0        # shift z upper outer
+        if (iz_cell-1) < 0:
+            szl += Nz
+        if (iz_cell) == (Nz - 1):
+            szu -= Nz
+            szu2 -= Nz
+        if (iz_cell+1) == (Nz - 1):
+            szu2 -= Nz
+        if (ir_cell) >= (Nr - 1):
+            sru = -1
+            sru2 = -2
+        if (ir_cell+1) == (Nr - 1):
+            sru2 = -1
+        if (ir_cell-1) < 0:
+            srl = 1
+
+        # Atomically add the registers to global memory
+        if frequency_per_cell > 0:
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m0_00)
+            cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_r_m1_00.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell - 1 + srl), J_r_m0_01)
+            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell - 1 + srl), J_r_m1_01.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m0_02)
+            cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_r_m1_02.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m0_03)
+            cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_r_m1_03.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell ), J_r_m0_10)
+            cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_r_m1_10.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell), J_r_m0_11)
+            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell), J_r_m1_11.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell), J_r_m1_11.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell), J_r_m0_12)
+            cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell), J_r_m1_12.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell), J_r_m0_13)
+            cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_r_m1_13.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m0_20)
+            cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_r_m1_20.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 1 + sru), J_r_m0_21)
+            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 1 + sru), J_r_m1_21.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m0_22)
+            cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_r_m1_22.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m0_23)
+            cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_r_m1_23.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m0_30)
+            cuda.atomic.add(j_r_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_r_m1_30.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell, ir_cell + 2 + sru2), J_r_m0_31)
+            cuda.atomic.add(j_r_m1.real, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_r_m1_31.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m0_32)
+            cuda.atomic.add(j_r_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_r_m1_32.imag)
+
+            cuda.atomic.add(j_r_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m0_33)
+            cuda.atomic.add(j_r_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.real)
+            cuda.atomic.add(j_r_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_r_m1_33.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m0_00)
+            cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_t_m1_00.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell - 1 + srl), J_t_m0_01)
+            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell - 1 + srl), J_t_m1_01.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m0_02)
+            cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_t_m1_02.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m0_03)
+            cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_t_m1_03.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell ), J_t_m0_10)
+            cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_t_m1_10.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell), J_t_m0_11)
+            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell), J_t_m1_11.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell), J_t_m1_11.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell), J_t_m0_12)
+            cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell), J_t_m1_12.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell), J_t_m0_13)
+            cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_t_m1_13.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m0_20)
+            cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_t_m1_20.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 1 + sru), J_t_m0_21)
+            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 1 + sru), J_t_m1_21.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m0_22)
+            cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_t_m1_22.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m0_23)
+            cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_t_m1_23.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m0_30)
+            cuda.atomic.add(j_t_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_t_m1_30.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell, ir_cell + 2 + sru2), J_t_m0_31)
+            cuda.atomic.add(j_t_m1.real, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_t_m1_31.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m0_32)
+            cuda.atomic.add(j_t_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_t_m1_32.imag)
+
+            cuda.atomic.add(j_t_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m0_33)
+            cuda.atomic.add(j_t_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.real)
+            cuda.atomic.add(j_t_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_t_m1_33.imag)
+
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m0_00)
+            cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell - 1 + srl), J_z_m1_00.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell - 1 + srl), J_z_m0_01)
+            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell - 1 + srl), J_z_m1_01.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m0_02)
+            cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell - 1 + srl), J_z_m1_02.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m0_03)
+            cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell - 1 + srl), J_z_m1_03.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell ), J_z_m0_10)
+            cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell ), J_z_m1_10.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell), J_z_m0_11)
+            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell), J_z_m1_11.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell), J_z_m1_11.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell), J_z_m0_12)
+            cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell), J_z_m1_12.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell), J_z_m0_13)
+            cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell), J_z_m1_13.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m0_20)
+            cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 1 + sru), J_z_m1_20.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 1 + sru), J_z_m0_21)
+            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 1 + sru), J_z_m1_21.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m0_22)
+            cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 1 + sru), J_z_m1_22.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m0_23)
+            cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 1 + sru), J_z_m1_23.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m0_30)
+            cuda.atomic.add(j_z_m1.real, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell - 1 + szl, ir_cell + 2 + sru2), J_z_m1_30.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell, ir_cell + 2 + sru2), J_z_m0_31)
+            cuda.atomic.add(j_z_m1.real, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell, ir_cell + 2 + sru2), J_z_m1_31.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m0_32)
+            cuda.atomic.add(j_z_m1.real, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell + 1 + szu, ir_cell + 2 + sru2), J_z_m1_32.imag)
+
+            cuda.atomic.add(j_z_m0.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m0_33)
+            cuda.atomic.add(j_z_m1.real, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.real)
+            cuda.atomic.add(j_z_m1.imag, (iz_cell + 2 + szu2, ir_cell + 2 + sru2), J_z_m1_33.imag)
diff --git a/fbpic/particles/deposition/numba_methods.py b/fbpic/particles/deposition/numba_methods.py
new file mode 100644
index 000000000..4ad61f9f9
--- /dev/null
+++ b/fbpic/particles/deposition/numba_methods.py
@@ -0,0 +1,66 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the deposition methods for rho and J for linear and cubic
+order shapes on the CPU with numba.
+"""
+import numba
+from scipy.constants import c, e
+
+@numba.njit
+def deposit_field_numba(Fptcl, Fgrid,
+        iz, ir, Sz, Sr, sign_guards):
+    """
+    Perform the deposition using numba
+
+    Parameters
+    ----------
+    Fptcl : 1darray of complexs
+        (one element per macroparticle)
+        Contains the charge or current for each macroparticle (already
+        multiplied by exp(im theta), from which to do the deposition
+
+    Fgrid : 2darray of complexs
+        Contains the fields on the interpolation grid.
+        Is modified by this function
+
+    iz, ir : 2darray of ints
+        Arrays of shape (shape_order+1, Ntot)
+        where Ntot is the number of macroparticles
+        Contains the index of the cells that each macroparticle
+        will deposit to.
+
+    Sz, Sr: 2darray of floats
+        Arrays of shape (shape_order+1, Ntot)
+        where Ntot is the number of macroparticles
+        Contains the weight for respective cells from iz and ir,
+        for each macroparticle.
+
+    sign_guards : float
+       The sign (+1 or -1) with which the weight of the guard cells should
+       be added to the 0th cell.
+    """
+
+    # Get the total number of particles
+    Ntot = len(Fptcl)
+
+    # Loop over all particles
+    for ip in range(Ntot):
+        # Loop over adjacent cells (given by shape order)
+        # Use helper variables `ir_corr` and `Sr_corr`, in order to avoid
+        # modifying ir and Sr in place. (This is not strictly necessary,
+        # but is just here as a safeguard.)
+        for cell_index_r in range(ir.shape[0]):
+            for cell_index_z in range(iz.shape[0]):
+                # Correct the guard cell index and sign
+                if ir[cell_index_r, ip] < 0:
+                    ir_corr = abs(ir[cell_index_r, ip]) - 1
+                    Sr_corr = sign_guards * Sr[cell_index_r, ip]
+                else:
+                    ir_corr = ir[cell_index_r, ip]
+                    Sr_corr = Sr[cell_index_r, ip]
+                # Deposit field from particle to the respective grid point
+                Fgrid[ iz[cell_index_z, ip], ir_corr ] += \
+                    Sz[cell_index_z,ip] * Sr_corr * Fptcl[ip]
diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
new file mode 100644
index 000000000..595d6bdb5
--- /dev/null
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -0,0 +1,454 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the deposition methods for rho and J for linear and cubic
+order shapes on the CPU with threading.
+"""
+import numba
+from numba import prange, int64
+import math
+from scipy.constants import c
+import numpy as np
+
+# -------------------------------
+# Particle shape factor functions
+# -------------------------------
+
+# Linear shapes
+@numba.njit
+def z_shape_linear(cell_position, index):  # linear shape factor along z
+    iz = int64(math.floor(cell_position))  # index of the lower cell
+    if index == 0:
+        return iz+1.-cell_position  # weight of the lower cell (iz)
+    if index == 1:  # (any other index falls through and returns None)
+        return cell_position - iz  # weight of the upper cell (iz+1)
+
+@numba.njit
+def r_shape_linear(cell_position, index):  # linear shape factor along r
+    flip_factor = 1.  # sign that multiplies the returned weight
+    ir = int64(math.floor(cell_position))  # index of the lower cell
+    if index == 0:
+        if ir < 0:  # lower cell has a negative index: flip the sign
+            flip_factor = -1.
+        return flip_factor*(ir+1.-cell_position)  # lower cell weight
+    if index == 1:  # flip_factor is still +1 here (never flipped)
+        return flip_factor*(cell_position - ir)  # upper cell weight
+
+# Cubic shapes
+@numba.njit
+def z_shape_cubic(cell_position, index):  # cubic shape factor along z
+    iz = int64(math.floor(cell_position)) - 1  # lowest of the 4 cells
+    if index == 0:  # weight of cell iz
+        return (-1./6.)*((cell_position-iz)-2)**3
+    if index == 1:  # weight of cell iz+1
+        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)
+    if index == 2:  # weight of cell iz+2
+        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)
+    if index == 3:  # weight of cell iz+3 (other indices return None)
+        return (-1./6.)*(((iz+3)-cell_position)-2)**3
+
+@numba.njit
+def r_shape_cubic(cell_position, index):  # cubic shape factor along r
+    flip_factor = 1.  # sign that multiplies the returned weight
+    ir = int64(math.floor(cell_position)) - 1  # lowest of the 4 cells
+    if index == 0:  # weight of cell ir
+        if ir < 0:  # negative cell index: flip the sign
+            flip_factor = -1.
+        return flip_factor*(-1./6.)*((cell_position-ir)-2)**3
+    if index == 1:  # weight of cell ir+1
+        if ir+1 < 0:  # negative cell index: flip the sign
+            flip_factor = -1.
+        return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4)
+    if index == 2:  # weight of cell ir+2
+        if ir+2 < 0:  # negative cell index: flip the sign
+            flip_factor = -1.
+        return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4)
+    if index == 3:  # weight of cell ir+3 (other indices return None)
+        if ir+3 < 0:  # negative cell index: flip the sign
+            flip_factor = -1.
+        return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3
+
+# -------------------------------
+# Field deposition - linear - rho
+# -------------------------------
+
+@numba.njit(parallel=True)
+def deposit_rho_prange_linear(x, y, z, w,
+                           invdz, zmin, Nz,
+                           invdr, rmin, Nr,
+                           rho_m0_global, rho_m1_global,
+                           rho_m0, rho_m1,
+                           nthreads, tx_chunks, tx_N):
+    """
+    Deposition of the charge density rho using numba prange on the CPU.
+    Iterates over the threads in parallel, while each thread iterates
+    over a batch of particles. Intermediate results for each threads are
+    stored in copies of the global grid. At the end of the parallel loop,
+    the thread-local field arrays are combined (summed) to the global array.
+
+    Calculates the weighted amount of rho that is deposited to the
+    4 cells surounding the particle based on its shape (linear).
+
+    The rest of the execution is similar to the CUDA equivalent function.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    rho_m0, rho_m1 : 2darrays of complexs
+        The charge density on the interpolation grid for
+        mode 0 and 1. (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the considered direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+    """
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    for tx in prange( nthreads ):
+        # Create thread_local helper arrays
+        # FIXME! ( instead of using zeros_like, 
+        # it would be nicer to use np.zeros((Nz,Nr)) )
+        rho_m0_thread = np.zeros_like( rho_m0 )
+        rho_m1_thread = np.zeros_like( rho_m1 )
+        # Loop over all particles in thread chunk
+        for idx in range( tx_chunks[tx] ):
+            # Calculate thread local particle index
+            ptcl_idx = idx + tx*tx_N
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate rho
+            # --------------------------------------------
+            # Mode 0
+            R_m0_scal = wj * exptheta_m0
+            # Mode 1
+            R_m1_scal = wj * exptheta_m1
+
+            # Original index of the uppper and lower cell
+            ir_cell = int(math.floor( r_cell ))
+            iz_cell = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_cell < 0:
+                ir_cell = 0
+            # absorbing in upper r
+            if ir_cell > Nr-1:
+                ir_cell = Nr-1
+            # periodic boundaries in z
+            if iz_cell < 0:
+                iz_cell += Nz
+            if iz_cell > Nz-1:
+                iz_cell -= Nz
+
+            # Boundary Region Shifts
+            ir_flip = int( math.floor(r_cell) )
+
+            R_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal
+            R_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal
+            R_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal
+            R_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal
+
+            if ir_flip == -1:
+                R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
+                R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
+                R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
+                R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
+            else:
+                R_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
+                R_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
+                R_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
+                R_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
+
+            # Cell shifts for the simulation boundaries
+            shift_r = 0
+            shift_z = 0
+            if ir_cell+1 > (Nr-1):
+                shift_r = -1
+            if iz_cell+1 > Nz-1:
+                shift_z -= Nz
+            # Write to thread local arrays
+            rho_m0_thread[iz_cell, ir_cell] += R_m0_00
+            rho_m1_thread[iz_cell, ir_cell] += R_m1_00
+
+            rho_m0_thread[iz_cell+1 + shift_z, ir_cell] += R_m0_01
+            rho_m1_thread[iz_cell+1 + shift_z, ir_cell] += R_m1_01
+
+            rho_m0_thread[iz_cell, ir_cell+1 + shift_r] += R_m0_10
+            rho_m1_thread[iz_cell, ir_cell+1 + shift_r] += R_m1_10
+
+            rho_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
+            rho_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11
+
+        # Write thread local deposition arrays to global deposition arrays
+        rho_m0_global[:,:,tx] = rho_m0_thread
+        rho_m1_global[:,:,tx] = rho_m1_thread
+
+    return
+
+# -------------------------------
+# Field deposition - linear - J
+# -------------------------------
+
+@numba.njit(parallel=True)
+def deposit_J_prange_linear(x, y, z, w,
+                         ux, uy, uz, inv_gamma,
+                         invdz, zmin, Nz,
+                         invdr, rmin, Nr,
+                         j_r_m0_global, j_r_m1_global,
+                         j_t_m0_global, j_t_m1_global,
+                         j_z_m0_global, j_z_m1_global,
+                         j_r_m0, j_r_m1,
+                         j_t_m0, j_t_m1,
+                         j_z_m0, j_z_m1,
+                         nthreads, tx_chunks, tx_N):
+    """
+    Deposition of the current density J using numba prange on the CPU.
+    Iterates over the threads in parallel, while each thread iterates
+    over a batch of particles. Intermediate results for each threads are
+    stored in copies of the global grid. At the end of the parallel loop,
+    the thread-local field arrays are combined (summed) to the global array.
+
+    Calculates the weighted amount of J that is deposited to the
+    4 cells surounding the particle based on its shape (linear).
+
+    The rest of the execution is similar to the CUDA equivalent function.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (in meters * second^-1)
+        The velocity of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs
+        The current component in each direction (r, t, z)
+        on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+    """
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    for tx in prange( nthreads ):
+        # Create thread_local helper arrays
+        # FIXME! ( instead of using zeros_like, 
+        # it would be nicer to use np.zeros((Nz,Nr)) )
+        j_r_m0_thread = np.zeros_like( j_r_m0 )
+        j_t_m0_thread = np.zeros_like( j_t_m0 )
+        j_z_m0_thread = np.zeros_like( j_z_m0 )
+        j_r_m1_thread = np.zeros_like( j_r_m1 )
+        j_t_m1_thread = np.zeros_like( j_t_m1 )
+        j_z_m1_thread = np.zeros_like( j_z_m1 )
+        # Loop over all particles in thread chunk
+        for idx in range( tx_chunks[tx] ):
+            # Calculate thread local particle index
+            ptcl_idx = idx + tx*tx_N
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Velocity
+            uxj = ux[ptcl_idx]
+            uyj = uy[ptcl_idx]
+            uzj = uz[ptcl_idx]
+            # Inverse gamma
+            inv_gammaj = inv_gamma[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Get weights for the deposition
+            # --------------------------------------------
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate the currents
+            # --------------------------------------------
+            # Mode 0
+            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
+            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
+            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
+            # Mode 1
+            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
+            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
+            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
+
+            # Original index of the uppper and lower cell
+            ir_cell = int(math.floor( r_cell ))
+            iz_cell = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_cell < 0:
+                ir_cell = 0
+            # absorbing in upper r
+            if ir_cell > Nr-1:
+                ir_cell = Nr-1
+            # periodic boundaries in z
+            if iz_cell < 0:
+                iz_cell += Nz
+            if iz_cell > Nz-1:
+                iz_cell -= Nz
+
+            # Boundary Region Shifts
+            ir_flip = int( math.floor(r_cell) )
+
+            J_r_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+            J_t_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+            J_z_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+            J_r_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+            J_t_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+            J_z_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+            J_r_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+            J_t_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+            J_z_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+            J_r_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+            J_t_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+            J_z_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+
+            # Take into account lower r flips
+            if ir_flip == -1:
+                J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+                J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+                J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+                J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+                J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+                J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+                J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+                J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+                J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+                J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+                J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+                J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+            else:
+                J_r_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+                J_t_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+                J_z_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+                J_r_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+                J_t_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+                J_z_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+                J_r_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+                J_t_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+                J_z_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+                J_r_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+                J_t_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+                J_z_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+
+            # Cell shifts for the simulation boundaries
+            shift_r = 0
+            shift_z = 0
+            if (ir_cell+1) > (Nr-1):
+                shift_r = -1
+            if (iz_cell+1) > Nz-1:
+                shift_z -= Nz
+
+            j_r_m0_thread[iz_cell, ir_cell] += J_r_m0_00
+            j_r_m1_thread[iz_cell, ir_cell] += J_r_m1_00
+
+            j_r_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_r_m0_01
+            j_r_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_r_m1_01
+
+            j_r_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_r_m0_10
+            j_r_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_r_m1_10
+
+            j_r_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11
+            j_r_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11
+
+            j_t_m0_thread[iz_cell, ir_cell] += J_t_m0_00
+            j_t_m1_thread[iz_cell, ir_cell] += J_t_m1_00
+
+            j_t_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_t_m0_01
+            j_t_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_t_m1_01
+
+            j_t_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_t_m0_10
+            j_t_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_t_m1_10
+
+            j_t_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11
+            j_t_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11
+
+            j_z_m0_thread[iz_cell, ir_cell] += J_z_m0_00
+            j_z_m1_thread[iz_cell, ir_cell] += J_z_m1_00
+
+            j_z_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_z_m0_01
+            j_z_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_z_m1_01
+
+            j_z_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_z_m0_10
+            j_z_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_z_m1_10
+
+            j_z_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11
+            j_z_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11
+
+        # Write thread local deposition arrays to global deposition arrays
+        j_r_m0_global[:,:,tx] = j_r_m0_thread
+        j_t_m0_global[:,:,tx] = j_t_m0_thread
+        j_z_m0_global[:,:,tx] = j_z_m0_thread
+        j_r_m1_global[:,:,tx] = j_r_m1_thread
+        j_t_m1_global[:,:,tx] = j_t_m1_thread
+        j_z_m1_global[:,:,tx] = j_z_m1_thread
+
+    return
\ No newline at end of file
diff --git a/fbpic/particles/gathering/__init__.py b/fbpic/particles/gathering/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fbpic/particles/gathering/cuda_methods.py b/fbpic/particles/gathering/cuda_methods.py
new file mode 100644
index 000000000..6bd5a6c0c
--- /dev/null
+++ b/fbpic/particles/gathering/cuda_methods.py
@@ -0,0 +1,599 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the field gathering methods for linear and cubic order shapes
+on the GPU using CUDA.
+"""
+from numba import cuda, float64, int64
+import math
+from scipy.constants import c, e
+import numpy as np
+
+# -----------------------
+# Field gathering linear
+# -----------------------
+
+@cuda.jit('void(float64[:], float64[:], float64[:], \
+            float64, float64, int32, \
+            float64, float64, int32, \
+            complex128[:,:], complex128[:,:], complex128[:,:], \
+            complex128[:,:], complex128[:,:], complex128[:,:], \
+            complex128[:,:], complex128[:,:], complex128[:,:], \
+            complex128[:,:], complex128[:,:], complex128[:,:], \
+            float64[:], float64[:], float64[:], \
+            float64[:], float64[:], float64[:])')
+def gather_field_gpu_linear(x, y, z,
+                    invdz, zmin, Nz,
+                    invdr, rmin, Nr,
+                    Er_m0, Et_m0, Ez_m0,
+                    Er_m1, Et_m1, Ez_m1,
+                    Br_m0, Bt_m0, Bz_m0,
+                    Br_m1, Bt_m1, Bz_m1,
+                    Ex, Ey, Ez,
+                    Bx, By, Bz):
+    """
+    Gathering of the fields (E and B) using numba on the GPU.
+    Each CUDA thread handles one particle and calculates the weighted
+    amount of fields acting on it, based on its shape (linear).
+    Fields are gathered in cylindrical coordinates and then
+    transformed to cartesian coordinates.
+    Supports only mode 0 and 1.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box along the
+        direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    Er_m0, Et_m0, Ez_m0 : 2darray of complex numbers
+        The electric fields on the interpolation grid for the mode 0
+
+    Er_m1, Et_m1, Ez_m1 : 2darray of complex numbers
+        The electric fields on the interpolation grid for the mode 1
+
+    Br_m0, Bt_m0, Bz_m0 : 2darray of complex numbers
+        The magnetic fields on the interpolation grid for the mode 0
+
+    Br_m1, Bt_m1, Bz_m1 : 2darray of complex numbers
+        The magnetic fields on the interpolation grid for the mode 1
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+        (is overwritten by this function)
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+        (is overwritten by this function)
+    """
+    # Get the index of this thread in the 1D CUDA grid
+    i = cuda.grid(1)
+    # Gather the fields for one particle per thread
+    # (only for thread indices < number of particles)
+    if i < x.shape[0]:
+        # Preliminary scalars for the cylindrical conversion
+        # --------------------------------------------
+        # Position
+        xj = x[i]
+        yj = y[i]
+        zj = z[i]
+
+        # Cylindrical conversion
+        rj = math.sqrt( xj**2 + yj**2 )
+        if (rj !=0. ) :
+            invr = 1./rj
+            cos = xj*invr  # Cosine
+            sin = yj*invr  # Sine
+        else :  # rj == 0: the angle is undefined, use theta = 0
+            cos = 1.
+            sin = 0.
+        exptheta_m0 = 1.  # exp(-i*0*theta)
+        exptheta_m1 = cos - 1.j*sin  # exp(-i*theta)
+
+        # Get linear weights for the gathering
+        # --------------------------------------------
+        # Position of the particle, in cell units (shifted by half a cell)
+        r_cell =  invdr*(rj - rmin) - 0.5
+        z_cell =  invdz*(zj - zmin) - 0.5
+        # Original index of the upper and lower cell
+        ir_lower = int(math.floor( r_cell ))
+        ir_upper = ir_lower + 1
+        iz_lower = int(math.floor( z_cell ))
+        iz_upper = iz_lower + 1
+        # Linear weight
+        Sr_lower = ir_upper - r_cell
+        Sr_upper = r_cell - ir_lower
+        Sz_lower = iz_upper - z_cell
+        Sz_upper = z_cell - iz_lower
+        # Guard-cell weight in lower r (stays zero unless ir_lower < 0)
+        Sr_guard = 0.
+
+        # Treat the boundary conditions
+        # --------------------------------------------
+        # guard cells in lower r: move the weight to Sr_guard, use cell 0
+        if ir_lower < 0:
+            Sr_guard = Sr_lower
+            Sr_lower = 0.
+            ir_lower = 0
+        # absorbing in upper r (indices are clamped to the last cell)
+        if ir_lower > Nr-1:
+            ir_lower = Nr-1
+        if ir_upper > Nr-1:
+            ir_upper = Nr-1
+        # periodic boundaries in z (indices are shifted by Nz)
+        # lower z boundaries
+        if iz_lower < 0:
+            iz_lower += Nz
+        if iz_upper < 0:
+            iz_upper += Nz
+        # upper z boundaries
+        if iz_lower > Nz-1:
+            iz_lower -= Nz
+        if iz_upper > Nz-1:
+            iz_upper -= Nz
+
+        # Precalculate the products of the z and r weights
+        S_ll = Sz_lower*Sr_lower
+        S_lu = Sz_lower*Sr_upper
+        S_ul = Sz_upper*Sr_lower
+        S_uu = Sz_upper*Sr_upper
+        S_lg = Sz_lower*Sr_guard
+        S_ug = Sz_upper*Sr_guard
+
+        # E-Field
+        # ----------------------------
+        # Define the initial placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ]
+        # Add the guard-cell weights at r index 0 (component-dependent sign)
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0]
+            Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0]
+            Fz_m +=  1. * S_lg * Ez_m0[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0]
+            Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0]
+            Fz_m +=  1. * S_ug * Ez_m0[ iz_upper, 0]
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ]
+        # Add the guard-cell weights at r index 0 (signs opposite to mode 0)
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m +=  1. * S_lg * Er_m1[ iz_lower, 0]
+            Ft_m +=  1. * S_lg * Et_m1[ iz_lower, 0]
+            Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m +=  1. * S_ug * Er_m1[ iz_upper, 0]
+            Ft_m +=  1. * S_ug * Et_m1[ iz_upper, 0]
+            Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0]
+        # Add the fields from the mode 1 (with a factor 2)
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Ex[i] = cos*Fr - sin*Ft
+        Ey[i] = sin*Fr + cos*Ft
+        Ez[i] = Fz
+
+        # B-Field
+        # ----------------------------
+        # Clear the placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ]
+        # Add the guard-cell weights at r index 0 (component-dependent sign)
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0]
+            Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0]
+            Fz_m +=  1. * S_lg * Bz_m0[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0]
+            Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0]
+            Fz_m +=  1. * S_ug * Bz_m0[ iz_upper, 0]
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ]
+
+        # Add the guard-cell weights at r index 0 (signs opposite to mode 0)
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m +=  1. * S_lg * Br_m1[ iz_lower, 0]
+            Ft_m +=  1. * S_lg * Bt_m1[ iz_lower, 0]
+            Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m +=  1. * S_ug * Br_m1[ iz_upper, 0]
+            Ft_m +=  1. * S_ug * Bt_m1[ iz_upper, 0]
+            Fz_m += -1. * S_ug * Bz_m1[ iz_upper, 0]
+        # Add the fields from the mode 1 (with a factor 2)
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Bx[i] = cos*Fr - sin*Ft
+        By[i] = sin*Fr + cos*Ft
+        Bz[i] = Fz
+
+# -----------------------
+# Field gathering cubic
+# -----------------------
+
+@cuda.jit('void(float64[:], float64[:], float64[:], \
+            float64, float64, int32, \
+            float64, float64, int32, \
+            complex128[:,:], complex128[:,:], complex128[:,:], \
+            complex128[:,:], complex128[:,:], complex128[:,:], \
+            complex128[:,:], complex128[:,:], complex128[:,:], \
+            complex128[:,:], complex128[:,:], complex128[:,:], \
+            float64[:], float64[:], float64[:], \
+            float64[:], float64[:], float64[:])')
+def gather_field_gpu_cubic(x, y, z,
+                    invdz, zmin, Nz,
+                    invdr, rmin, Nr,
+                    Er_m0, Et_m0, Ez_m0,
+                    Er_m1, Et_m1, Ez_m1,
+                    Br_m0, Bt_m0, Bz_m0,
+                    Br_m1, Bt_m1, Bz_m1,
+                    Ex, Ey, Ez,
+                    Bx, By, Bz):
+    """
+    Gathering of the fields (E and B) using numba on the GPU.
+    Each CUDA thread handles one particle and calculates the weighted
+    amount of fields acting on it, based on its shape (cubic).
+    Fields are gathered in cylindrical coordinates and then
+    transformed to cartesian coordinates.
+    Supports only mode 0 and 1.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box along the
+        direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    Er_m0, Et_m0, Ez_m0 : 2darray of complex numbers
+        The electric fields on the interpolation grid for the mode 0
+
+    Er_m1, Et_m1, Ez_m1 : 2darray of complex numbers
+        The electric fields on the interpolation grid for the mode 1
+
+    Br_m0, Bt_m0, Bz_m0 : 2darray of complex numbers
+        The magnetic fields on the interpolation grid for the mode 0
+
+    Br_m1, Bt_m1, Bz_m1 : 2darray of complex numbers
+        The magnetic fields on the interpolation grid for the mode 1
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+        (is overwritten by this function)
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+        (is overwritten by this function)
+    """
+
+    # Get the index of this thread in the 1D CUDA grid
+    i = cuda.grid(1)
+    # Gather the fields for one particle per thread
+    # (only for thread indices < number of particles)
+    if i < x.shape[0]:
+        # Preliminary scalars for the cylindrical conversion
+        # --------------------------------------------
+        # Position
+        xj = x[i]
+        yj = y[i]
+        zj = z[i]
+
+        # Cylindrical conversion
+        rj = math.sqrt(xj**2 + yj**2)
+        if (rj != 0.):
+            invr = 1./rj
+            cos = xj*invr  # Cosine
+            sin = yj*invr  # Sine
+        else:  # rj == 0: the angle is undefined, use theta = 0
+            cos = 1.
+            sin = 0.
+        exptheta_m0 = 1.  # exp(-i*0*theta)
+        exptheta_m1 = cos - 1.j*sin  # exp(-i*theta)
+
+        # Get weights for the gathering
+        # --------------------------------------------
+        # Position of the particle, in cell units (shifted by half a cell)
+        r_cell = invdr*(rj - rmin) - 0.5
+        z_cell = invdz*(zj - zmin) - 0.5
+
+        # Calculate the cell indices and shape factors (4 cells per axis)
+        Sr = cuda.local.array((4,), dtype=float64)
+        ir = cuda.local.array((4,), dtype=int64)
+        ir[0] = int64(math.floor(r_cell)) - 1
+        ir[1] = ir[0] + 1
+        ir[2] = ir[1] + 1
+        ir[3] = ir[2] + 1
+        Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3
+        Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4)
+        Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4)
+        Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3
+        iz = cuda.local.array((4,), dtype=int64)
+        Sz = cuda.local.array((4,), dtype=float64)
+        iz[0] = int64(math.floor(z_cell)) - 1
+        iz[1] = iz[0] + 1
+        iz[2] = iz[1] + 1
+        iz[3] = iz[2] + 1
+        Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3
+        Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4)
+        Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4)
+        Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3
+        # Lower and upper periodic boundary for z
+        for index_z in range(4):
+            if iz[index_z] < 0:
+                iz[index_z] += Nz
+            if iz[index_z] > Nz - 1:
+                iz[index_z] -= Nz
+        # Lower (mirrored index) and upper (clamped index) boundary for r
+        for index_r in range(4):
+            if ir[index_r] < 0:
+                ir[index_r] = abs(ir[index_r])-1  # -1 -> 0, -2 -> 1
+                Sr[index_r] = (-1.)*Sr[index_r]  # sign is tested below
+            if ir[index_r] > Nr - 1:
+                ir[index_r] = Nr - 1
+
+        # E-Field
+        # ----------------------------
+        # Define the initial placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0 (Fz: negative weights are sign-flipped)
+        for index_r in range(4):
+            for index_z in range(4):
+                Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]]
+                Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]]
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Ez_m0[iz[index_z], ir[index_r]]
+                else:
+                    Fz_m += Sz[index_z]*Sr[index_r]* \
+                        Ez_m0[iz[index_z], ir[index_r]]
+
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1 (Fr, Ft: negative weights sign-flipped)
+        for index_r in range(4):
+            for index_z in range(4):
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                                Er_m1[iz[index_z], ir[index_r]]
+                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                                Et_m1[iz[index_z], ir[index_r]]
+                else:
+                    Fr_m += Sz[index_z]*Sr[index_r]* \
+                                Er_m1[iz[index_z], ir[index_r]]
+                    Ft_m += Sz[index_z]*Sr[index_r]* \
+                                Et_m1[iz[index_z], ir[index_r]]
+                Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]]
+
+        # Add the fields from the mode 1 (with a factor 2)
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Ex[i] = (cos*Fr - sin*Ft)
+        Ey[i] = (sin*Fr + cos*Ft)
+        Ez[i] = Fz
+
+        # B-Field
+        # ----------------------------
+        # Clear the placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0 (Fz: negative weights are sign-flipped)
+        for index_r in range(4):
+            for index_z in range(4):
+                Fr_m += Sz[index_z]*Sr[index_r]* \
+                    Br_m0[iz[index_z], ir[index_r]]
+                Ft_m += Sz[index_z]*Sr[index_r]* \
+                    Bt_m0[iz[index_z], ir[index_r]]
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Bz_m0[iz[index_z], ir[index_r]]
+                else:
+                    Fz_m += Sz[index_z]*Sr[index_r]* \
+                        Bz_m0[iz[index_z], ir[index_r]]
+
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+
+        # Add the fields for mode 1 (Fr, Ft: negative weights sign-flipped)
+        for index_r in range(4):
+            for index_z in range(4):
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Br_m1[iz[index_z], ir[index_r]]
+                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Bt_m1[iz[index_z], ir[index_r]]
+                else:
+                    Fr_m += Sz[index_z]*Sr[index_r]* \
+                        Br_m1[iz[index_z], ir[index_r]]
+                    Ft_m += Sz[index_z]*Sr[index_r]* \
+                        Bt_m1[iz[index_z], ir[index_r]]
+                Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]]
+
+        # Add the fields from the mode 1 (with a factor 2)
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Bx[i] = cos*Fr - sin*Ft
+        By[i] = sin*Fr + cos*Ft
+        Bz[i] = Fz
\ No newline at end of file
diff --git a/fbpic/particles/gathering/numba_methods.py b/fbpic/particles/gathering/numba_methods.py
new file mode 100644
index 000000000..f85a79ea0
--- /dev/null
+++ b/fbpic/particles/gathering/numba_methods.py
@@ -0,0 +1,82 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the field gathering methods for linear and cubic order shapes
+on the CPU with numba.
+"""
+import numba
+
+@numba.jit(nopython=True)
+def gather_field_numba(exptheta, m, Fgrid, Fptcl,
+                       iz, ir, Sz, Sr, sign_guards):
+    """
+    Perform the weighted sum using numba
+
+    Parameters
+    ----------
+    exptheta : 1darray of complex numbers
+        (one element per macroparticle)
+        Contains exp(-im theta) for each macroparticle
+
+    m : int
+        Index of the mode.
+        Determines whether a factor 2 should be applied
+
+    Fgrid : 2darray of complex numbers
+        Contains the fields on the interpolation grid,
+        from which to do the gathering
+
+    Fptcl : 1darray of floats
+        (one element per macroparticle)
+        Contains the fields for each macroparticle
+        Is incremented in-place by this function
+
+    iz, ir : 2darray of ints
+        Arrays of shape (shape_order+1, Ntot)
+        where Ntot is the number of macroparticles
+        Contains the index of the cells that each macroparticle
+        will gather from.
+
+    Sz, Sr: 2darray of floats
+        Arrays of shape (shape_order+1, Ntot)
+        where Ntot is the number of macroparticles
+        Contains the weight for respective cells from iz and ir,
+        for each macroparticle.
+
+    sign_guards : float
+        The sign (+1 or -1) with which the weight of a guard cell
+        (ir < 0) is applied to the mirrored cell of index abs(ir)-1.
+    """
+    # Get the total number of particles
+    Ntot = len(Fptcl)
+
+    # Loop over the particles
+    for ip in range(Ntot):
+        # Erase the temporary variable
+        F = 0.j
+        # Loop over all the adjacent cells (given by shape order)
+        # Use helper variables `ir_corr` and `Sr_corr`.
+        # This is necessary, because ir and Sr should **not** be modified
+        # **in-place**. (This is because ir and Sr are reused several
+        # times, as we call the present function 3 times, with different
+        # values for sign_guards.)
+        for cell_index_r in range(ir.shape[0]):
+            for cell_index_z in range(iz.shape[0]):
+                # Correct the guard cell index and sign
+                if ir[cell_index_r, ip] < 0:
+                    ir_corr = abs(ir[cell_index_r, ip]) - 1
+                    Sr_corr = sign_guards * Sr[cell_index_r, ip]
+                else:
+                    ir_corr = ir[cell_index_r, ip]
+                    Sr_corr = Sr[cell_index_r, ip]
+                # Gather the field value at the respective grid point
+                F += Sz[cell_index_z, ip] * Sr_corr * \
+                    Fgrid[ iz[cell_index_z, ip], ir_corr]
+
+        # Multiply by the complex phase, add the real part (x2 for m > 0)
+        if m == 0:
+            Fptcl[ip] += (F * exptheta[ip]).real
+        if m > 0:
+            Fptcl[ip] += 2 * (F * exptheta[ip]).real
\ No newline at end of file
diff --git a/fbpic/particles/gathering/threading_methods.py b/fbpic/particles/gathering/threading_methods.py
new file mode 100644
index 000000000..73da99cfe
--- /dev/null
+++ b/fbpic/particles/gathering/threading_methods.py
@@ -0,0 +1,580 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the field gathering methods for linear and cubic order shapes
+on the CPU with threading
+"""
+from numba import prange, int64
+import numba
+import math
+
+# -----------------------
+# Field gathering linear
+# -----------------------
+
+@numba.njit(parallel=True)
+def gather_field_prange_linear(x, y, z,
+                    invdz, zmin, Nz,
+                    invdr, rmin, Nr,
+                    Er_m0, Et_m0, Ez_m0,
+                    Er_m1, Et_m1, Ez_m1,
+                    Br_m0, Bt_m0, Bz_m0,
+                    Br_m1, Bt_m1, Bz_m1,
+                    Ex, Ey, Ez,
+                    Bx, By, Bz):
+    """
+    Gathering of the fields (E and B) using numba with multi-threading.
+    Iterates over the particles, calculates the weighted amount
+    of fields acting on each particle based on its shape (linear).
+    Fields are gathered in cylindrical coordinates and then
+    transformed to cartesian coordinates.
+    Supports only mode 0 and 1.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box along the
+        direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    Er_m0, Et_m0, Ez_m0 : 2darray of complexs
+        The electric fields on the interpolation grid for the mode 0
+
+    Er_m1, Et_m1, Ez_m1 : 2darray of complexs
+        The electric fields on the interpolation grid for the mode 1
+
+    Br_m0, Bt_m0, Bz_m0 : 2darray of complexs
+        The magnetic fields on the interpolation grid for the mode 0
+
+    Br_m1, Bt_m1, Bz_m1 : 2darray of complexs
+        The magnetic fields on the interpolation grid for the mode 1
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+        (is modified by this function)
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+        (is modified by this function)
+    """
+    # Deposit the field per cell in parallel
+    for i in prange(x.shape[0]):
+        # Preliminary arrays for the cylindrical conversion
+        # --------------------------------------------
+        # Position
+        xj = x[i]
+        yj = y[i]
+        zj = z[i]
+
+        # Cylindrical conversion
+        rj = math.sqrt( xj**2 + yj**2 )
+        if (rj !=0. ) :
+            invr = 1./rj
+            cos = xj*invr  # Cosine
+            sin = yj*invr  # Sine
+        else :
+            cos = 1.
+            sin = 0.
+        exptheta_m0 = 1.
+        exptheta_m1 = cos - 1.j*sin
+
+        # Get linear weights for the deposition
+        # --------------------------------------------
+        # Positions of the particles, in the cell unit
+        r_cell =  invdr*(rj - rmin) - 0.5
+        z_cell =  invdz*(zj - zmin) - 0.5
+        # Original index of the uppper and lower cell
+        ir_lower = int(math.floor( r_cell ))
+        ir_upper = ir_lower + 1
+        iz_lower = int(math.floor( z_cell ))
+        iz_upper = iz_lower + 1
+        # Linear weight
+        Sr_lower = ir_upper - r_cell
+        Sr_upper = r_cell - ir_lower
+        Sz_lower = iz_upper - z_cell
+        Sz_upper = z_cell - iz_lower
+        # Set guard weights to zero
+        Sr_guard = 0.
+
+        # Treat the boundary conditions
+        # --------------------------------------------
+        # guard cells in lower r
+        if ir_lower < 0:
+            Sr_guard = Sr_lower
+            Sr_lower = 0.
+            ir_lower = 0
+        # absorbing in upper r
+        if ir_lower > Nr-1:
+            ir_lower = Nr-1
+        if ir_upper > Nr-1:
+            ir_upper = Nr-1
+        # periodic boundaries in z
+        # lower z boundaries
+        if iz_lower < 0:
+            iz_lower += Nz
+        if iz_upper < 0:
+            iz_upper += Nz
+        # upper z boundaries
+        if iz_lower > Nz-1:
+            iz_lower -= Nz
+        if iz_upper > Nz-1:
+            iz_upper -= Nz
+
+        #Precalculate Shapes
+        S_ll = Sz_lower*Sr_lower
+        S_lu = Sz_lower*Sr_upper
+        S_ul = Sz_upper*Sr_lower
+        S_uu = Sz_upper*Sr_upper
+        S_lg = Sz_lower*Sr_guard
+        S_ug = Sz_upper*Sr_guard
+
+        # E-Field
+        # ----------------------------
+        # Define the initial placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ]
+        # Add the fields from the guard cells
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0]
+            Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0]
+            Fz_m +=  1. * S_lg * Ez_m0[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0]
+            Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0]
+            Fz_m +=  1. * S_ug * Ez_m0[ iz_upper, 0]
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ]
+        # Add the fields from the guard cells
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m +=  1. * S_lg * Er_m1[ iz_lower, 0]
+            Ft_m +=  1. * S_lg * Et_m1[ iz_lower, 0]
+            Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m +=  1. * S_ug * Er_m1[ iz_upper, 0]
+            Ft_m +=  1. * S_ug * Et_m1[ iz_upper, 0]
+            Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0]
+        # Add the fields from the mode 1
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Ex[i] = cos*Fr - sin*Ft
+        Ey[i] = sin*Fr + cos*Ft
+        Ez[i] = Fz
+
+        # B-Field
+        # ----------------------------
+        # Clear the placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ]
+        # Add the fields from the guard cells
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0]
+            Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0]
+            Fz_m +=  1. * S_lg * Bz_m0[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0]
+            Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0]
+            Fz_m +=  1. * S_ug * Bz_m0[ iz_upper, 0]
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ]
+
+        # Add the fields from the guard cells
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m +=  1. * S_lg * Br_m1[ iz_lower, 0]
+            Ft_m +=  1. * S_lg * Bt_m1[ iz_lower, 0]
+            Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m +=  1. * S_ug * Br_m1[ iz_upper, 0]
+            Ft_m +=  1. * S_ug * Bt_m1[ iz_upper, 0]
+            Fz_m += -1. * S_ug * Bz_m1[ iz_upper, 0]
+        # Add the fields from the mode 1
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Bx[i] = cos*Fr - sin*Ft
+        By[i] = sin*Fr + cos*Ft
+        Bz[i] = Fz
+
+    return Ex, Ey, Ez, Bx, By, Bz
+
+# -----------------------
+# Field gathering cubic
+# -----------------------
+
+@numba.njit(parallel=True)
+def gather_field_prange_cubic(x, y, z,
+                    invdz, zmin, Nz,
+                    invdr, rmin, Nr,
+                    Er_m0, Et_m0, Ez_m0,
+                    Er_m1, Et_m1, Ez_m1,
+                    Br_m0, Bt_m0, Bz_m0,
+                    Br_m1, Bt_m1, Bz_m1,
+                    Ex, Ey, Ez,
+                    Bx, By, Bz):
+    """
+    Gathering of the fields (E and B) using numba with multi-threading.
+    Iterates over the particles, calculates the weighted amount
+    of fields acting on each particle based on its shape (cubic).
+    Fields are gathered in cylindrical coordinates and then
+    transformed to cartesian coordinates.
+    Supports only mode 0 and 1.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box along the
+        direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    Er_m0, Et_m0, Ez_m0 : 2darray of complex numbers
+        The electric fields on the interpolation grid for the mode 0
+
+    Er_m1, Et_m1, Ez_m1 : 2darray of complex numbers
+        The electric fields on the interpolation grid for the mode 1
+
+    Br_m0, Bt_m0, Bz_m0 : 2darray of complex numbers
+        The magnetic fields on the interpolation grid for the mode 0
+
+    Br_m1, Bt_m1, Bz_m1 : 2darray of complex numbers
+        The magnetic fields on the interpolation grid for the mode 1
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+        (is modified by this function, and returned)
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+        (is modified by this function, and returned)
+    """
+    # Gather the fields for each particle in parallel
+    # (each prange iteration handles one particle)
+    for i in prange(x.shape[0]):
+        # Preliminary arrays for the cylindrical conversion
+        # --------------------------------------------
+        # Position
+        xj = x[i]
+        yj = y[i]
+        zj = z[i]
+
+        # Cylindrical conversion
+        rj = math.sqrt(xj**2 + yj**2)
+        if (rj != 0.):
+            invr = 1./rj
+            cos = xj*invr  # Cosine
+            sin = yj*invr  # Sine
+        else:
+            cos = 1.
+            sin = 0.
+        exptheta_m0 = 1.
+        exptheta_m1 = cos - 1.j*sin
+
+        # Get the weights for the gathering
+        # --------------------------------------------
+        # Positions of the particle, in the cell unit
+        r_cell = invdr*(rj - rmin) - 0.5
+        z_cell = invdz*(zj - zmin) - 0.5
+
+        # Calculate the shape factors
+        Sr = [0.,0.,0.,0.]
+        ir = [0,0,0,0]
+        ir[0] = int64(math.floor(r_cell)) - 1
+        ir[1] = ir[0] + 1
+        ir[2] = ir[1] + 1
+        ir[3] = ir[2] + 1
+        Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3
+        Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4)
+        Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4)
+        Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3
+        Sz = [0.,0.,0.,0.]
+        iz = [0,0,0,0]
+        iz[0] = int64(math.floor(z_cell)) - 1
+        iz[1] = iz[0] + 1
+        iz[2] = iz[1] + 1
+        iz[3] = iz[2] + 1
+        Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3
+        Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4)
+        Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4)
+        Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3
+        # Lower and upper periodic boundary for z
+        for index_z in range(4):
+            if iz[index_z] < 0:
+                iz[index_z] += Nz
+            if iz[index_z] > Nz - 1:
+                iz[index_z] -= Nz
+        # Boundaries in r: reflect (with sign flip) below 0, clamp above Nr-1
+        for index_r in range(4):
+            if ir[index_r] < 0:
+                ir[index_r] = abs(ir[index_r])-1
+                Sr[index_r] = (-1.)*Sr[index_r]
+            if ir[index_r] > Nr - 1:
+                ir[index_r] = Nr - 1
+
+        # E-Field
+        # ----------------------------
+        # Define the initial placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0 (Ez is weighted by |Sz*Sr|)
+        for index_r in range(4):
+            for index_z in range(4):
+                Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]]
+                Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]]
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Ez_m0[iz[index_z], ir[index_r]]
+                else:
+                    Fz_m += Sz[index_z]*Sr[index_r]* \
+                        Ez_m0[iz[index_z], ir[index_r]]
+
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1 (Er and Et are weighted by |Sz*Sr|)
+        for index_r in range(4):
+            for index_z in range(4):
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                                Er_m1[iz[index_z], ir[index_r]]
+                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                                Et_m1[iz[index_z], ir[index_r]]
+                else:
+                    Fr_m += Sz[index_z]*Sr[index_r]* \
+                                Er_m1[iz[index_z], ir[index_r]]
+                    Ft_m += Sz[index_z]*Sr[index_r]* \
+                                Et_m1[iz[index_z], ir[index_r]]
+                Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]]
+
+        # Add the fields from the mode 1
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Ex[i] = (cos*Fr - sin*Ft)
+        Ey[i] = (sin*Fr + cos*Ft)
+        Ez[i] = Fz
+
+        # B-Field
+        # ----------------------------
+        # Clear the placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0 (Bz is weighted by |Sz*Sr|)
+        for index_r in range(4):
+            for index_z in range(4):
+                Fr_m += Sz[index_z]*Sr[index_r]* \
+                    Br_m0[iz[index_z], ir[index_r]]
+                Ft_m += Sz[index_z]*Sr[index_r]* \
+                    Bt_m0[iz[index_z], ir[index_r]]
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Bz_m0[iz[index_z], ir[index_r]]
+                else:
+                    Fz_m += Sz[index_z]*Sr[index_r]* \
+                        Bz_m0[iz[index_z], ir[index_r]]
+
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+
+        # Add the fields for mode 1 (Br and Bt are weighted by |Sz*Sr|)
+        for index_r in range(4):
+            for index_z in range(4):
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Br_m1[iz[index_z], ir[index_r]]
+                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Bt_m1[iz[index_z], ir[index_r]]
+                else:
+                    Fr_m += Sz[index_z]*Sr[index_r]* \
+                        Br_m1[iz[index_z], ir[index_r]]
+                    Ft_m += Sz[index_z]*Sr[index_r]* \
+                        Bt_m1[iz[index_z], ir[index_r]]
+                Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]]
+
+        # Add the fields from the mode 1
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Bx[i] = cos*Fr - sin*Ft
+        By[i] = sin*Fr + cos*Ft
+        Bz[i] = Fz
+
+    return Ex, Ey, Ez, Bx, By, Bz
\ No newline at end of file
diff --git a/fbpic/particles/push/__init__.py b/fbpic/particles/push/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fbpic/particles/push/cuda_methods.py b/fbpic/particles/push/cuda_methods.py
new file mode 100644
index 000000000..41d48d12a
--- /dev/null
+++ b/fbpic/particles/push/cuda_methods.py
@@ -0,0 +1,191 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the particle push methods on the GPU using CUDA.
+"""
+from numba import cuda
+import math
+from scipy.constants import c, e
+
+@cuda.jit('void(float64[:], float64[:], float64[:], \
+            float64[:], float64[:], float64[:], \
+            float64[:], float64)')
+def push_x_gpu( x, y, z, ux, uy, uz, inv_gamma, dt ) :
+    """
+    Move the particles by half a timestep, with one CUDA thread per particle
+
+    The positions (x, y, z) may initially be either one half-timestep
+    *behind* the momenta (ux, uy, uz), or at the same timestep as
+    the momenta.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+        (is modified by this function)
+
+    ux, uy, uz : 1darray of floats (dimensionless)
+        The momenta of the particles (velocity = c * inv_gamma * u)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    dt : float (seconds)
+        The full timestep (the positions are advanced by dt/2)
+    """
+    # One thread per particle; surplus threads exit immediately
+    ip = cuda.grid(1)
+    if ip >= x.shape[0]:
+        return
+
+    # Displacement per unit of u: c * (dt/2) * inv_gamma
+    step = c*0.5*dt * inv_gamma[ip]
+    x[ip] += step*ux[ip]
+    y[ip] += step*uy[ip]
+    z[ip] += step*uz[ip]
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+            float64[:], float64[:], float64[:], \
+            float64[:], float64[:], float64[:], \
+            float64, float64, int32, float64)')
+def push_p_gpu( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz,
+                q, m, Ntot, dt ) :
+    """
+    Push the momenta of the particles, with one CUDA thread per particle
+
+    Parameters
+    ----------
+    ux, uy, uz : 1darray of floats (dimensionless)
+        The momenta of the particles
+        (is modified by this function)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor (also modified)
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+
+    q : float
+        The charge of the particle species
+
+    m : float
+        The mass of the particle species
+
+    Ntot : int
+        The number of particles to push (threads beyond it do nothing)
+
+    dt : float
+        The time by which the momenta are advanced
+    """
+    # One thread per particle; surplus threads exit immediately
+    ip = cuda.grid(1)
+    if ip >= Ntot:
+        return
+
+    # Prefactors of the E and B fields, common to the whole species
+    e_factor = q*dt/(m*c)
+    b_factor = 0.5*q*dt/m
+
+    pushed = push_p_vay( ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+        Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], e_factor, b_factor )
+    ux[ip], uy[ip], uz[ip], inv_gamma[ip] = pushed
+
+@cuda.jit('void(float64[:], float64[:], float64[:], float64[:], \
+            float64[:], float64[:], float64[:], \
+            float64[:], float64[:], float64[:], \
+            float64, int32, float64, int16[:])')
+def push_p_ioniz_gpu( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz,
+                m, Ntot, dt, ionization_level ) :
+    """
+    Push the momenta of ionizable particles, with one CUDA thread per
+    particle. The charge of each particle is `ionization_level` times the
+    elementary charge, and neutral particles are left unchanged.
+
+    Parameters
+    ----------
+    ux, uy, uz : 1darray of floats (dimensionless)
+        The momenta of the particles
+        (is modified by this function)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor (also modified)
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+
+    m : float
+        The mass of the particle species
+
+    Ntot : int
+        The number of particles to push (threads beyond it do nothing)
+
+    dt : float
+        The time by which the momenta are advanced
+
+    ionization_level : 1darray of ints
+        The number of electrons that each ion is missing
+        (compared to a neutral atom)
+    """
+    # One thread per particle; surplus threads exit immediately
+    ip = cuda.grid(1)
+    if ip >= Ntot:
+        return
+    # Neutral macroparticles (level 0) carry no charge: skip them
+    level = ionization_level[ip]
+    if level == 0:
+        return
+    e_factor = level * e * dt/(m*c)
+    b_factor = 0.5 * level * e * dt/m
+    pushed = push_p_vay( ux[ip], uy[ip], uz[ip], inv_gamma[ip],
+        Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], e_factor, b_factor )
+    ux[ip], uy[ip], uz[ip], inv_gamma[ip] = pushed
+
+@cuda.jit(device=True, inline=True)
+def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
+    Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
+    """
+    Vay push of one macroparticle; returns the new (ux, uy, uz, inv_gamma)
+    """
+    # Get the magnetic rotation vector tau = bconst*B, and its squared norm
+    taux = bconst*Bx
+    tauy = bconst*By
+    tauz = bconst*Bz
+    tau2 = taux**2 + tauy**2 + tauz**2
+
+    # Get the momenta at the half timestep: u + econst*E + inv_gamma*(u x tau)
+    uxp = ux_i + econst*Ex \
+    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
+    uyp = uy_i + econst*Ey \
+    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
+    uzp = uz_i + econst*Ez \
+    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
+    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
+    utau = uxp*taux + uyp*tauy + uzp*tauz
+
+    # Get the new 1./gamma
+    inv_gamma_f = math.sqrt(
+        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
+
+    # Scale tau and utau by the new 1./gamma (t = tau/gamma, ut = utau/gamma)
+    tx = inv_gamma_f*taux
+    ty = inv_gamma_f*tauy
+    tz = inv_gamma_f*tauz
+    ut = inv_gamma_f*utau
+    s = 1./( 1 + tau2*inv_gamma_f**2 )
+
+    # Get the new u
+    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
+    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
+    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
+
+    return( ux_f, uy_f, uz_f, inv_gamma_f )
\ No newline at end of file
diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py
new file mode 100644
index 000000000..6e5807085
--- /dev/null
+++ b/fbpic/particles/push/numba_methods.py
@@ -0,0 +1,110 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the particle push methods on the CPU with numba.
+"""
+import numba
+import math
+from scipy.constants import c, e
+
+@numba.njit
+def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
+    """
+    Move the first `Ntot` particles by half a timestep, in place
+
+    Each position is shifted by c*(dt/2)*inv_gamma*u, so (x, y, z) may
+    start either half a step behind (ux, uy, uz) or in sync with them.
+    """
+    # Distance travelled at the speed of light during dt/2
+    half_cdt = c*0.5*dt
+
+    for k in range(Ntot):
+        # Per-particle displacement factor, shared by the three axes
+        step = half_cdt * inv_gamma[k]
+        x[k] += step * ux[k]
+        y[k] += step * uy[k]
+        z[k] += step * uz[k]
+
+@numba.njit
+def push_p_numba( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) :
+    """
+    Update in place the momenta and 1/gamma of the first `Ntot` particles
+    """
+    # Prefactors of the E and B fields, common to the whole species
+    e_factor = q*dt/(m*c)
+    b_factor = 0.5*q*dt/m
+
+    # Push the particles one by one with the Vay scheme
+    for k in range(Ntot):
+        pushed = push_p_vay( ux[k], uy[k], uz[k], inv_gamma[k],
+            Ex[k], Ey[k], Ez[k], Bx[k], By[k], Bz[k], e_factor, b_factor )
+        ux[k], uy[k], uz[k], inv_gamma[k] = pushed
+
+@numba.njit
+def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) :
+    """
+    Update in place the momenta and 1/gamma of ionizable particles
+
+    The charge of each particle is `ionization_level` times the
+    elementary charge; neutral particles are left unchanged.
+    """
+    # Prefactors of the E and B fields for a single elementary charge
+    e_unit = e*dt/(m*c)
+    b_unit = 0.5*e*dt/m
+
+    for k in range(Ntot):
+        level = ionization_level[k]
+        # Only the charged macroparticles are pushed
+        if level != 0:
+            # Scale the prefactors by the charge state
+            e_factor = e_unit * level
+            b_factor = b_unit * level
+            pushed = push_p_vay(
+                ux[k], uy[k], uz[k], inv_gamma[k],
+                Ex[k], Ey[k], Ez[k], Bx[k], By[k], Bz[k],
+                e_factor, b_factor )
+            ux[k], uy[k], uz[k], inv_gamma[k] = pushed
+
+@numba.njit
+def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
+                Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
+    """
+    Vay push of one macroparticle; returns the new (ux, uy, uz, inv_gamma)
+    """
+    # Get the magnetic rotation vector tau = bconst*B, and its squared norm
+    taux = bconst*Bx
+    tauy = bconst*By
+    tauz = bconst*Bz
+    tau2 = taux**2 + tauy**2 + tauz**2
+
+    # Get the momenta at the half timestep: u + econst*E + inv_gamma*(u x tau)
+    uxp = ux_i + econst*Ex \
+    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
+    uyp = uy_i + econst*Ey \
+    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
+    uzp = uz_i + econst*Ez \
+    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
+    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
+    utau = uxp*taux + uyp*tauy + uzp*tauz
+
+    # Get the new 1./gamma
+    inv_gamma_f = math.sqrt(
+        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
+
+    # Scale tau and utau by the new 1./gamma (t = tau/gamma, ut = utau/gamma)
+    tx = inv_gamma_f*taux
+    ty = inv_gamma_f*tauy
+    tz = inv_gamma_f*tauz
+    ut = inv_gamma_f*utau
+    s = 1./( 1 + tau2*inv_gamma_f**2 )
+
+    # Get the new u
+    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
+    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
+    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
+
+    return( ux_f, uy_f, uz_f, inv_gamma_f )
\ No newline at end of file
diff --git a/fbpic/particles/push/threading_methods.py b/fbpic/particles/push/threading_methods.py
new file mode 100644
index 000000000..06205dbfc
--- /dev/null
+++ b/fbpic/particles/push/threading_methods.py
@@ -0,0 +1,114 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the particle push methods on the CPU with threading.
+"""
+import numba
+from numba import prange
+import math
+from scipy.constants import c, e
+
+@numba.njit(parallel=True)
+def push_x_prange( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
+    """
+    Move the first `Ntot` particles by half a timestep, in place
+
+    Each position is shifted by c*(dt/2)*inv_gamma*u, so (x, y, z) may
+    start either half a step behind (ux, uy, uz) or in sync with them.
+    """
+    # Distance travelled at the speed of light during dt/2
+    half_cdt = c*0.5*dt
+
+    for k in prange(Ntot):
+        # Per-particle displacement factor, shared by the three axes
+        step = half_cdt * inv_gamma[k]
+        x[k] += step * ux[k]
+        y[k] += step * uy[k]
+        z[k] += step * uz[k]
+    return x, y, z
+
+@numba.njit(parallel=True)
+def push_p_prange( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) :
+    """
+    Update in place the momenta and 1/gamma of the first `Ntot` particles
+    """
+    # Prefactors of the E and B fields, common to the whole species
+    e_factor = q*dt/(m*c)
+    b_factor = 0.5*q*dt/m
+
+    # Push the particles in a parallel loop, with the Vay scheme
+    for k in prange(Ntot):
+        pushed = push_p_vay( ux[k], uy[k], uz[k], inv_gamma[k],
+            Ex[k], Ey[k], Ez[k], Bx[k], By[k], Bz[k], e_factor, b_factor )
+        ux[k], uy[k], uz[k], inv_gamma[k] = pushed
+    return ux, uy, uz, inv_gamma
+
+@numba.njit(parallel=True)
+def push_p_ioniz_prange( ux, uy, uz, inv_gamma,
+                Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) :
+    """
+    Update in place the momenta and 1/gamma of ionizable particles
+
+    The charge of each particle is `ionization_level` times the
+    elementary charge; neutral particles are left unchanged.
+    """
+    # Prefactors of the E and B fields for a single elementary charge
+    e_unit = e*dt/(m*c)
+    b_unit = 0.5*e*dt/m
+
+    for k in prange(Ntot):
+        level = ionization_level[k]
+        # Only the charged macroparticles are pushed
+        if level != 0:
+            # Scale the prefactors by the charge state
+            e_factor = e_unit * level
+            b_factor = b_unit * level
+            pushed = push_p_vay(
+                ux[k], uy[k], uz[k], inv_gamma[k],
+                Ex[k], Ey[k], Ez[k], Bx[k], By[k], Bz[k],
+                e_factor, b_factor )
+            ux[k], uy[k], uz[k], inv_gamma[k] = pushed
+    return ux, uy, uz, inv_gamma
+
+@numba.njit
+def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
+                Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
+    """
+    Vay push of one macroparticle; returns the new (ux, uy, uz, inv_gamma)
+    """
+    # Get the magnetic rotation vector tau = bconst*B, and its squared norm
+    taux = bconst*Bx
+    tauy = bconst*By
+    tauz = bconst*Bz
+    tau2 = taux**2 + tauy**2 + tauz**2
+
+    # Get the momenta at the half timestep: u + econst*E + inv_gamma*(u x tau)
+    uxp = ux_i + econst*Ex \
+    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
+    uyp = uy_i + econst*Ey \
+    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
+    uzp = uz_i + econst*Ez \
+    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
+    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
+    utau = uxp*taux + uyp*tauy + uzp*tauz
+
+    # Get the new 1./gamma
+    inv_gamma_f = math.sqrt(
+        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
+
+    # Scale tau and utau by the new 1./gamma (t = tau/gamma, ut = utau/gamma)
+    tx = inv_gamma_f*taux
+    ty = inv_gamma_f*tauy
+    tz = inv_gamma_f*tauz
+    ut = inv_gamma_f*utau
+    s = 1./( 1 + tau2*inv_gamma_f**2 )
+
+    # Get the new u
+    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
+    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
+    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
+
+    return( ux_f, uy_f, uz_f, inv_gamma_f )
\ No newline at end of file
diff --git a/fbpic/particles/utilities/__init__.py b/fbpic/particles/utilities/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fbpic/particles/utilities/cuda_sorting.py b/fbpic/particles/utilities/cuda_sorting.py
new file mode 100644
index 000000000..cb0a9a09f
--- /dev/null
+++ b/fbpic/particles/utilities/cuda_sorting.py
@@ -0,0 +1,182 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the particle sorting methods on the GPU using CUDA.
+"""
+from numba import cuda
+from accelerate.cuda import sorting
+
+# -----------------------------------------------------
+# Sorting utilities - get_cell_idx / sort / prefix_sum
+# -----------------------------------------------------
+
+@cuda.jit('void(int32[:], uint32[:], \
+                float64[:], float64[:], float64[:], \
+                float64, float64, int32, \
+                float64, float64, int32)')
+def get_cell_idx_per_particle(cell_idx, sorted_idx,
+                              x, y, z,
+                              invdz, zmin, Nz,
+                              invdr, rmin, Nr):
+    """
+    Get the cell index of each particle.
+    The cell index is 1d and calculated by:
+    cell index in r + cell index in z * number of cells in r.
+    The cell_idx of a particle is defined by
+    the lower cell in r and z, that it deposits its field to.
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The cell index of each particle (written by this function)
+
+    sorted_idx : 1darray of integers
+        The sorted index array, which is reset here to the identity
+        (sorted_idx[i] = i) before doing the sort
+
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+        (only read, not modified, by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box, in each direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+    """
+    i = cuda.grid(1)
+    if i < cell_idx.shape[0]:
+            # Preliminary values for the cylindrical conversion
+            xj = x[i]
+            yj = y[i]
+            zj = z[i]
+            rj = math.sqrt( xj**2 + yj**2 )
+
+            # Positions of the particles, in the cell unit
+            r_cell =  invdr*(rj - rmin) - 0.5
+            z_cell =  invdz*(zj - zmin) - 0.5
+
+            # Original index of the lower cell (in r and in z)
+            ir_lower = int(math.floor( r_cell ))
+            iz_lower = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_lower < 0:
+                ir_lower = 0
+            # absorbing in upper r
+            if ir_lower > Nr-1:
+                ir_lower = Nr-1
+            # periodic boundaries in z
+            if iz_lower < 0:
+                iz_lower += Nz
+            if iz_lower > Nz-1:
+                iz_lower -= Nz
+
+            # Reset sorted_idx array
+            sorted_idx[i] = i
+            # Calculate the 1D cell_idx by cell_idx_ir + cell_idx_iz * Nr
+            cell_idx[i] = ir_lower + iz_lower * Nr
+
+def sort_particles_per_cell(cell_idx, sorted_idx):
+    """
+    Sort the cell index of the particles and modify the sorted index
+    array accordingly (nothing is done when there are no particles).
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The cell index of each particle (used as the sort keys)
+
+    sorted_idx : 1darray of integers
+        Represents the original index of the
+        particle before the sorting.
+    """
+    Ntot = cell_idx.shape[0]
+    if Ntot > 0:
+        sorter = sorting.RadixSort(Ntot, dtype = np.int32)
+        sorter.sort(cell_idx, vals = sorted_idx)
+
+@cuda.jit('void(int32[:], int32[:])')
+def incl_prefix_sum(cell_idx, prefix_sum):
+    """
+    Perform an inclusive parallel prefix sum on the sorted
+    cell index array. The prefix sum array represents the
+    cumulative sum of the number of particles per cell
+    for each cell index.
+
+    Parameters
+    ----------
+    cell_idx : 1darray of integers
+        The (sorted) cell index of each particle
+
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    # i is the index of the macroparticle
+    i = cuda.grid(1)
+    if i < cell_idx.shape[0]-1:
+        # ci: index of the cell of the present macroparticle
+        ci = cell_idx[i]
+        # ci_next: index of the cell of the next macroparticle
+        ci_next = cell_idx[i+1]
+        # Fill all the cells between ci and ci_next with the
+        # inclusive cumulative sum of the number of particles until ci
+        while ci < ci_next:
+            # The cumulative sum of the number of particles per cell
+            # until ci is i+1 (since i obeys python index, starting at 0)
+            prefix_sum[ci] = i+1
+            ci += 1
+    # The last "macroparticle" of the cell_idx array fills up the
+    # rest of the prefix sum array
+    if i == cell_idx.shape[0]-1:
+        # Get the cell_index of the last macroparticle
+        ci = cell_idx[i]
+        # Fill all the remaining entries of the prefix sum array
+        for empty_index in range(ci, prefix_sum.shape[0]):
+            prefix_sum[empty_index] = i+1
+
+@cuda.jit('void(int32[:])')
+def reset_prefix_sum(prefix_sum):
+    """
+    Reset the prefix sum array, by setting
+    all of its values to zero.
+
+    Parameters
+    ----------
+    prefix_sum : 1darray of integers
+        Represents the cumulative sum of
+        the particles per cell
+    """
+    i = cuda.grid(1)
+    if i < prefix_sum.shape[0]:
+        prefix_sum[i] = 0
+
+@cuda.jit('void(uint32[:], float64[:], float64[:])')
+def write_sorting_buffer(sorted_idx, val, buf):
+    """
+    Writes the values of a particle array to a buffer,
+    while rearranging them to match the sorted cell index array.
+
+    Parameters
+    ----------
+    sorted_idx : 1darray of integers
+        Represents the original index of the
+        particle before the sorting
+
+    val : 1d array of floats
+        A particle data array (only read by this function)
+
+    buf : 1d array of floats
+        A buffer array to temporarily store the
+        sorted particle data array (written by this function)
+    """
+    i = cuda.grid(1)
+    if i < val.shape[0]:
+        buf[i] = val[sorted_idx[i]]
diff --git a/fbpic/particles/utilities/utility_methods.py b/fbpic/particles/utilities/utility_methods.py
new file mode 100644
index 000000000..d79019f0d
--- /dev/null
+++ b/fbpic/particles/utilities/utility_methods.py
@@ -0,0 +1,155 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines particle utility methods.
+"""
+import numpy as np
+
+# -----------------------
+# Particle shapes utility
+# -----------------------
+
+def weights(x, invdx, offset, Nx, direction, shape_order):
+    """
+    Return the array of cell indices and corresponding shape factors
+    for current/charge deposition and field gathering
+
+    Parameters:
+    -----------
+    x : 1darray of floats (in meters)
+        Array of particle positions along a given direction
+        (one element per macroparticle)
+
+    invdx : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    offset : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nx : int
+        Number of gridpoints along the considered direction
+
+    direction : string
+        Determines the boundary conditions. Either 'r' or 'z'
+
+    shape_order : int
+        Order of the shape factor.
+        Either 1 (linear) or 3 (cubic)
+
+    Returns:
+    --------
+    A tuple containing :
+
+    i: 2darray of ints
+        An array of shape (shape_order+1, Ntot)
+        where Ntot is the number of macroparticles
+        (i.e. the number of elements in the array x)
+        This array contains the indices of the grid cells
+        (along the axis specified by `direction`) where each macroparticle
+        deposits charge/current and gathers field data.
+
+    S: 2darray of floats
+        An array of shape (shape_order+1, Ntot)
+        where Ntot is the number of macroparticles
+        (i.e. the number of elements in the array x)
+        This array contains the shape factors (a.k.a. interpolation weights)
+        that correspond to each of the indices in the array `i`.
+    """
+    # Positions of the particles, in the cell unit
+    x_cell = invdx*(x - offset) - 0.5
+
+    # Initialize empty arrays of the correct size
+    i = np.empty( (shape_order+1, len(x)), dtype=np.int64)
+    S = np.empty( (shape_order+1, len(x)), dtype=np.float64)
+
+    # Indices and shape factors
+    if shape_order == 1:
+        i[0,:] = np.floor(x_cell).astype('int')
+        i[1,:] = i[0,:] + 1
+        # Linear weight
+        S[0,:] = i[1,:] - x_cell
+        S[1,:] = x_cell - i[0,:]
+    elif shape_order == 3:
+        i[0,:] = np.floor(x_cell).astype('int') - 1
+        i[1,:] = i[0,:] + 1
+        i[2,:] = i[0,:] + 2
+        i[3,:] = i[0,:] + 3
+        # Cubic Weights
+        S[0,:] = -1./6. * ((x_cell-i[0])-2)**3
+        S[1,:] = 1./6. * (3*((x_cell-i[1])**3) - 6*((x_cell-i[1])**2)+4)
+        S[2,:] = 1./6. * (3*((i[2]-x_cell)**3) - 6*((i[2]-x_cell)**2)+4)
+        S[3,:] = -1./6. * ((i[3]-x_cell)-2)**3
+    else:
+        raise ValueError("shapes other than linear and cubic are not supported yet.")
+
+    # Periodic boundary conditions in z
+    if direction == 'z':
+        # Lower Bound Periodic
+        i = np.where( i < 0, i+Nx, i )
+        # Upper Bound Periodic
+        i = np.where( i > Nx-1, i-Nx, i )
+    # Absorbing boundary condition at the upper r boundary
+    elif direction == 'r':
+        i = np.where(  i > Nx-1, Nx-1, i )
+        # Note: The lower bound index shift for r is done in the gather
+        # and deposit methods because the sign changes.
+        # This avoids using specific guard cells.
+    else:
+        raise ValueError("Unrecognized `direction` : %s" % direction)
+
+    # Return the result
+    return( i, S )
+
+# ----------------------------
+# Angle initialization utility
+# ----------------------------
+
+def unalign_angles( thetap, Npz, Npr, method='irrational' ) :
+    """
+    Shift the angles so that the particles are
+    not all aligned along the arms of a star transversely
+
+    The fact that the particles are all aligned can produce
+    numerical artefacts, especially if the polarization of the laser
+    is aligned with this direction.
+
+    Here, for each position in r and z, we add the *same*
+    shift for all the Nptheta particles that are at this position.
+    (This preserves the fact that certain modes are 0 initially.)
+    How this shift varies from one position to another depends on
+    the method chosen.
+
+    Parameters
+    ----------
+    thetap : 3darray of floats
+        An array of shape (Npz, Npr, Nptheta) containing the angular
+        positions of the particles, and which is modified by this function.
+
+    Npz, Npr : ints
+        The number of macroparticles along the z and r directions
+
+    method : string
+        Either 'random' or 'irrational'
+    """
+    # Determine the angle shift
+    if method == 'random' :
+        angle_shift = 2*np.pi*np.random.rand(Npz, Npr)
+    elif method == 'irrational' :
+        # Subrandom sequence, by adding irrational numbers (sqrt(2) and sqrt(3))
+        # This ensures that the sequence does not wrap around and induce
+        # correlations
+        shiftr = np.sqrt(2)*np.arange(Npr)
+        shiftz = np.sqrt(3)*np.arange(Npz)
+        angle_shift = 2*np.pi*( shiftz[:,np.newaxis] + shiftr[np.newaxis,:] )
+        angle_shift = np.mod( angle_shift, 2*np.pi )
+    else :
+        raise ValueError(
+      "method must be either 'random' or 'irrational' but is %s" %method )
+
+    # Add the angle shift to thetap
+    # np.newaxis ensures that the angles that are at the same positions
+    # in r and z have the same shift
+    thetap[:,:,:] = thetap[:,:,:] + angle_shift[:,:, np.newaxis]

From 8f2e3ff8523c132faf522c77d85bbb524ee07339 Mon Sep 17 00:00:00 2001
From: Manuel Kirchen <manuel.kirchen@desy.de>
Date: Fri, 14 Jul 2017 19:05:43 +0200
Subject: [PATCH 03/36] Fix pyflakes errors

---
 fbpic/particles/deposition/numba_methods.py |  1 -
 fbpic/particles/gathering/cuda_methods.py   |  2 -
 fbpic/particles/particles.py                | 41 +++++++++++----------
 fbpic/particles/utilities/cuda_sorting.py   |  2 +
 4 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/fbpic/particles/deposition/numba_methods.py b/fbpic/particles/deposition/numba_methods.py
index 4ad61f9f9..54fedeb39 100644
--- a/fbpic/particles/deposition/numba_methods.py
+++ b/fbpic/particles/deposition/numba_methods.py
@@ -7,7 +7,6 @@
 order shapes on the CPU with numba.
 """
 import numba
-from scipy.constants import c, e
 
 @numba.njit
 def deposit_field_numba(Fptcl, Fgrid,
diff --git a/fbpic/particles/gathering/cuda_methods.py b/fbpic/particles/gathering/cuda_methods.py
index 6bd5a6c0c..392d3ef52 100644
--- a/fbpic/particles/gathering/cuda_methods.py
+++ b/fbpic/particles/gathering/cuda_methods.py
@@ -8,8 +8,6 @@
 """
 from numba import cuda, float64, int64
 import math
-from scipy.constants import c, e
-import numpy as np
 
 # -----------------------
 # Field gathering linear
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index 0094e305e..73197b2b5 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -10,7 +10,6 @@
 from .ionization import Ionizer
 from .tracking import ParticleTracker
 import numba
-import math
 
 # Load the utility methods
 from .utilities.utility_methods import weights, unalign_angles
@@ -786,13 +785,14 @@ def deposit( self, fld, fieldtype ) :
                         grid[0].rho, grid[1].rho,
                         self.nthreads, tx_chunks, tx_N )
                 elif self.particle_shape == 'cubic':
-                    deposit_rho_prange_cubic(
-                        self.x, self.y, self.z, self.w,
-                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
-                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        rho_m0_global, rho_m1_global,
-                        grid[0].rho, grid[1].rho,
-                        self.nthreads, tx_chunks, tx_N )
+                    print('Not yet implemented')
+                    # deposit_rho_prange_cubic(
+                    #     self.x, self.y, self.z, self.w,
+                    #     grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                    #     grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                    #     rho_m0_global, rho_m1_global,
+                    #     grid[0].rho, grid[1].rho,
+                    #     self.nthreads, tx_chunks, tx_N )
                 else:
                     raise ValueError("`particle_shape` should be either \
                                       'linear' or 'cubic' \
@@ -836,18 +836,19 @@ def deposit( self, fld, fieldtype ) :
                         grid[0].Jz, grid[1].Jz,
                         self.nthreads, tx_chunks, tx_N )
                 elif self.particle_shape == 'cubic':
-                    deposit_J_prange_cubic(
-                        self.x, self.y, self.z, self.w,
-                        self.ux, self.uy, self.uz, self.inv_gamma,
-                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
-                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                        Jr_m0_global, Jr_m1_global,
-                        Jt_m0_global, Jt_m1_global,
-                        Jz_m0_global, Jz_m1_global,
-                        grid[0].Jr, grid[1].Jr,
-                        grid[0].Jt, grid[1].Jt,
-                        grid[0].Jz, grid[1].Jz,
-                        self.nthreads, tx_chunks, tx_N )
+                    print('Not yet implemented')
+                    # deposit_J_prange_cubic(
+                    #     self.x, self.y, self.z, self.w,
+                    #     self.ux, self.uy, self.uz, self.inv_gamma,
+                    #     grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                    #     grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                    #     Jr_m0_global, Jr_m1_global,
+                    #     Jt_m0_global, Jt_m1_global,
+                    #     Jz_m0_global, Jz_m1_global,
+                    #     grid[0].Jr, grid[1].Jr,
+                    #     grid[0].Jt, grid[1].Jt,
+                    #     grid[0].Jz, grid[1].Jz,
+                    #     self.nthreads, tx_chunks, tx_N )
                 else:
                     raise ValueError("`particle_shape` should be either \
                                       'linear' or 'cubic' \
diff --git a/fbpic/particles/utilities/cuda_sorting.py b/fbpic/particles/utilities/cuda_sorting.py
index cb0a9a09f..45a1ddb9f 100644
--- a/fbpic/particles/utilities/cuda_sorting.py
+++ b/fbpic/particles/utilities/cuda_sorting.py
@@ -7,6 +7,8 @@
 """
 from numba import cuda
 from accelerate.cuda import sorting
+import math
+import numpy as np
 
 # -----------------------------------------------------
 # Sorting utilities - get_cell_idx / sort / prefix_sum

From 380b326780bc20761003d9a675fe6d8262df99a2 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Fri, 14 Jul 2017 21:54:03 -0700
Subject: [PATCH 04/36] Print number of threads along with number of MPI procs

---
 fbpic/main.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fbpic/main.py b/fbpic/main.py
index f33554966..e3ce1e975 100644
--- a/fbpic/main.py
+++ b/fbpic/main.py
@@ -228,7 +228,7 @@ def dens_func( z, r ) ...
         self.comm = BoundaryCommunicator( Nz, zmin, zmax, Nr, rmax, Nm, dt,
             boundaries, n_order, n_guard, n_damp, exchange_period,
             use_all_mpi_ranks )
-        print_simulation_setup( self.comm, self.use_cuda )
+        print_simulation_setup( self.comm, self.use_cuda, self.use_threading )
         # Modify domain region
         zmin, zmax, p_zmin, p_zmax, Nz = \
               self.comm.divide_into_domain(zmin, zmax, p_zmin, p_zmax)
@@ -595,7 +595,7 @@ def progression_bar( i, Ntot, measured_start, Nbars=50, char='-'):
     sys.stdout.write(', %d:%02d:%02d left' % (h, m, s))
     sys.stdout.flush()
 
-def print_simulation_setup( comm, use_cuda ):
+def print_simulation_setup( comm, use_cuda, use_threading ):
     """
     Print message about the number of proc and
     whether it is using GPU or CPU.
@@ -607,13 +607,20 @@ def print_simulation_setup( comm, use_cuda ):
 
     use_cuda: bool
         Whether the simulation is set up to use CUDA
+
+    use_threading: bool
+        Whether the simulation is set up to use threads on CPU
     """
     if comm.rank == 0:
         if use_cuda:
             message = "\nRunning FBPIC on GPU "
         else:
             message = "\nRunning FBPIC on CPU "
-        message += "with %d proc.\n" %comm.size
+        message += "with %d proc" %comm.size
+        if use_threading and not use_cuda:
+            message += " (%d threads per proc)" %numba.config.NUMBA_NUM_THREADS
+        message += ".\n"
+            
         print( message )
 
 def adapt_to_grid( x, p_xmin, p_xmax, p_nx, ncells_empty=0 ):

From af15196a8758221aaeef9638beb508922bc565f8 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Sat, 15 Jul 2017 18:29:23 -0700
Subject: [PATCH 05/36] Swapped the order of global arrays + removed
 thread-local arrays

---
 .../particles/deposition/threading_methods.py | 90 +++++++------------
 fbpic/particles/particles.py                  | 36 ++++----
 2 files changed, 49 insertions(+), 77 deletions(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 595d6bdb5..7f023f5b0 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -79,7 +79,6 @@ def deposit_rho_prange_linear(x, y, z, w,
                            invdz, zmin, Nz,
                            invdr, rmin, Nr,
                            rho_m0_global, rho_m1_global,
-                           rho_m0, rho_m1,
                            nthreads, tx_chunks, tx_N):
     """
     Deposition of the charge density rho using numba prange on the CPU.
@@ -120,8 +119,6 @@ def deposit_rho_prange_linear(x, y, z, w,
         # Create thread_local helper arrays
         # FIXME! ( instead of using zeros_like, 
         # it would be nicer to use np.zeros((Nz,Nr)) )
-        rho_m0_thread = np.zeros_like( rho_m0 )
-        rho_m1_thread = np.zeros_like( rho_m1 )
         # Loop over all particles in thread chunk
         for idx in range( tx_chunks[tx] ):
             # Calculate thread local particle index
@@ -203,21 +200,17 @@ def deposit_rho_prange_linear(x, y, z, w,
             if iz_cell+1 > Nz-1:
                 shift_z -= Nz
             # Write to thread local arrays
-            rho_m0_thread[iz_cell, ir_cell] += R_m0_00
-            rho_m1_thread[iz_cell, ir_cell] += R_m1_00
+            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00
+            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00
 
-            rho_m0_thread[iz_cell+1 + shift_z, ir_cell] += R_m0_01
-            rho_m1_thread[iz_cell+1 + shift_z, ir_cell] += R_m1_01
+            rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m0_01
+            rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m1_01
 
-            rho_m0_thread[iz_cell, ir_cell+1 + shift_r] += R_m0_10
-            rho_m1_thread[iz_cell, ir_cell+1 + shift_r] += R_m1_10
+            rho_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m0_10
+            rho_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m1_10
 
-            rho_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
-            rho_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11
-
-        # Write thread local deposition arrays to global deposition arrays
-        rho_m0_global[:,:,tx] = rho_m0_thread
-        rho_m1_global[:,:,tx] = rho_m1_thread
+            rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
+            rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11
 
     return
 
@@ -233,9 +226,6 @@ def deposit_J_prange_linear(x, y, z, w,
                          j_r_m0_global, j_r_m1_global,
                          j_t_m0_global, j_t_m1_global,
                          j_z_m0_global, j_z_m1_global,
-                         j_r_m0, j_r_m1,
-                         j_t_m0, j_t_m1,
-                         j_z_m0, j_z_m1,
                          nthreads, tx_chunks, tx_N):
     """
     Deposition of the current density J using numba prange on the CPU.
@@ -283,12 +273,6 @@ def deposit_J_prange_linear(x, y, z, w,
         # Create thread_local helper arrays
         # FIXME! ( instead of using zeros_like, 
         # it would be nicer to use np.zeros((Nz,Nr)) )
-        j_r_m0_thread = np.zeros_like( j_r_m0 )
-        j_t_m0_thread = np.zeros_like( j_t_m0 )
-        j_z_m0_thread = np.zeros_like( j_z_m0 )
-        j_r_m1_thread = np.zeros_like( j_r_m1 )
-        j_t_m1_thread = np.zeros_like( j_t_m1 )
-        j_z_m1_thread = np.zeros_like( j_z_m1 )
         # Loop over all particles in thread chunk
         for idx in range( tx_chunks[tx] ):
             # Calculate thread local particle index
@@ -407,48 +391,40 @@ def deposit_J_prange_linear(x, y, z, w,
             if (iz_cell+1) > Nz-1:
                 shift_z -= Nz
 
-            j_r_m0_thread[iz_cell, ir_cell] += J_r_m0_00
-            j_r_m1_thread[iz_cell, ir_cell] += J_r_m1_00
-
-            j_r_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_r_m0_01
-            j_r_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_r_m1_01
+            j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00
+            j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00
 
-            j_r_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_r_m0_10
-            j_r_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_r_m1_10
+            j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01
+            j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01
 
-            j_r_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11
-            j_r_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11
+            j_r_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10
+            j_r_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10
 
-            j_t_m0_thread[iz_cell, ir_cell] += J_t_m0_00
-            j_t_m1_thread[iz_cell, ir_cell] += J_t_m1_00
+            j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11
+            j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11
 
-            j_t_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_t_m0_01
-            j_t_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_t_m1_01
+            j_t_m0_global[tx,iz_cell, ir_cell] += J_t_m0_00
+            j_t_m1_global[tx,iz_cell, ir_cell] += J_t_m1_00
 
-            j_t_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_t_m0_10
-            j_t_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_t_m1_10
+            j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01
+            j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01
 
-            j_t_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11
-            j_t_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11
+            j_t_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10
+            j_t_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10
 
-            j_z_m0_thread[iz_cell, ir_cell] += J_z_m0_00
-            j_z_m1_thread[iz_cell, ir_cell] += J_z_m1_00
+            j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11
+            j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11
 
-            j_z_m0_thread[iz_cell+1 + shift_z, ir_cell] += J_z_m0_01
-            j_z_m1_thread[iz_cell+1 + shift_z, ir_cell] += J_z_m1_01
+            j_z_m0_global[tx,iz_cell, ir_cell] += J_z_m0_00
+            j_z_m1_global[tx,iz_cell, ir_cell] += J_z_m1_00
 
-            j_z_m0_thread[iz_cell, ir_cell+1 + shift_r] += J_z_m0_10
-            j_z_m1_thread[iz_cell, ir_cell+1 + shift_r] += J_z_m1_10
+            j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01
+            j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01
 
-            j_z_m0_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11
-            j_z_m1_thread[iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11
+            j_z_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10
+            j_z_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10
 
-        # Write thread local deposition arrays to global deposition arrays
-        j_r_m0_global[:,:,tx] = j_r_m0_thread
-        j_t_m0_global[:,:,tx] = j_t_m0_thread
-        j_z_m0_global[:,:,tx] = j_z_m0_thread
-        j_r_m1_global[:,:,tx] = j_r_m1_thread
-        j_t_m1_global[:,:,tx] = j_t_m1_thread
-        j_z_m1_global[:,:,tx] = j_z_m1_thread
+            j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11
+            j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11
 
-    return
\ No newline at end of file
+    return
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index 73197b2b5..fa4497cc7 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -770,10 +770,10 @@ def deposit( self, fld, fieldtype ) :
             if fieldtype == 'rho':
                 # Generate temporary arrays for rho
                 rho_m0_global = np.zeros(
-                    (grid[0].rho.shape[0], grid[0].rho.shape[1], self.nthreads), 
+                    (self.nthreads, grid[0].rho.shape[0], grid[0].rho.shape[1]), 
                     dtype=grid[0].rho.dtype )
                 rho_m1_global = np.zeros(
-                    (grid[1].rho.shape[0], grid[1].rho.shape[1], self.nthreads), 
+                    (self.nthreads, grid[1].rho.shape[0], grid[1].rho.shape[1]), 
                     dtype=grid[1].rho.dtype )
                 # Deposit rho using CPU threading
                 if self.particle_shape == 'linear':
@@ -782,7 +782,6 @@ def deposit( self, fld, fieldtype ) :
                         grid[0].invdz, grid[0].zmin, grid[0].Nz,
                         grid[0].invdr, grid[0].rmin, grid[0].Nr,
                         rho_m0_global, rho_m1_global,
-                        grid[0].rho, grid[1].rho,
                         self.nthreads, tx_chunks, tx_N )
                 elif self.particle_shape == 'cubic':
                     print('Not yet implemented')
@@ -798,28 +797,28 @@ def deposit( self, fld, fieldtype ) :
                                       'linear' or 'cubic' \
                                        but is `%s`" % self.particle_shape)
                 # Sum thread-local results to main field array
-                grid[0].rho = np.sum(rho_m0_global, axis=2)
-                grid[1].rho = np.sum(rho_m1_global, axis=2)
+                grid[0].rho = np.sum(rho_m0_global, axis=0)
+                grid[1].rho = np.sum(rho_m1_global, axis=0)
 
             elif fieldtype == 'J':
                 # Generate temporary arrays for J
                 Jr_m0_global = np.zeros(
-                    (grid[0].Jr.shape[0], grid[0].Jr.shape[1], self.nthreads), 
+                    (self.nthreads, grid[0].Jr.shape[0], grid[0].Jr.shape[1]), 
                     dtype=grid[0].Jr.dtype )
                 Jt_m0_global = np.zeros(
-                    (grid[0].Jt.shape[0], grid[0].Jt.shape[1], self.nthreads), 
+                    (self.nthreads, grid[0].Jt.shape[0], grid[0].Jt.shape[1]), 
                     dtype=grid[0].Jt.dtype )
                 Jz_m0_global = np.zeros(
-                    (grid[0].Jz.shape[0], grid[0].Jz.shape[1], self.nthreads), 
+                    (self.nthreads, grid[0].Jz.shape[0], grid[0].Jz.shape[1]), 
                     dtype=grid[0].Jz.dtype )
                 Jr_m1_global = np.zeros(
-                    (grid[1].Jr.shape[0], grid[1].Jr.shape[1], self.nthreads), 
+                    (self.nthreads, grid[1].Jr.shape[0], grid[1].Jr.shape[1]), 
                     dtype=grid[1].Jr.dtype )
                 Jt_m1_global = np.zeros(
-                    (grid[1].Jt.shape[0], grid[1].Jt.shape[1], self.nthreads), 
+                    (self.nthreads, grid[1].Jt.shape[0], grid[1].Jt.shape[1]), 
                     dtype=grid[1].Jt.dtype )
                 Jz_m1_global = np.zeros(
-                    (grid[1].Jz.shape[0], grid[1].Jz.shape[1], self.nthreads), 
+                    (self.nthreads, grid[1].Jz.shape[0], grid[1].Jz.shape[1]), 
                     dtype=grid[1].Jz.dtype )
                 # Deposit J using CPU threading
                 if self.particle_shape == 'linear':
@@ -831,9 +830,6 @@ def deposit( self, fld, fieldtype ) :
                         Jr_m0_global, Jr_m1_global,
                         Jt_m0_global, Jt_m1_global,
                         Jz_m0_global, Jz_m1_global,
-                        grid[0].Jr, grid[1].Jr,
-                        grid[0].Jt, grid[1].Jt,
-                        grid[0].Jz, grid[1].Jz,
                         self.nthreads, tx_chunks, tx_N )
                 elif self.particle_shape == 'cubic':
                     print('Not yet implemented')
@@ -854,12 +850,12 @@ def deposit( self, fld, fieldtype ) :
                                       'linear' or 'cubic' \
                                        but is `%s`" % self.particle_shape)
                 # Sum thread-local results to main field array
-                grid[0].Jr = np.sum(Jr_m0_global, axis=2)
-                grid[0].Jt = np.sum(Jt_m0_global, axis=2)
-                grid[0].Jz = np.sum(Jz_m0_global, axis=2)
-                grid[1].Jr = np.sum(Jr_m1_global, axis=2)
-                grid[1].Jt = np.sum(Jt_m1_global, axis=2)
-                grid[1].Jz = np.sum(Jz_m1_global, axis=2)
+                grid[0].Jr = np.sum(Jr_m0_global, axis=0)
+                grid[0].Jt = np.sum(Jt_m0_global, axis=0)
+                grid[0].Jz = np.sum(Jz_m0_global, axis=0)
+                grid[1].Jr = np.sum(Jr_m1_global, axis=0)
+                grid[1].Jt = np.sum(Jt_m1_global, axis=0)
+                grid[1].Jz = np.sum(Jz_m1_global, axis=0)
 
             else:
                 raise ValueError("`fieldtype` should be either 'J' or \

From 1e252be4d50daf7aceaf6b962c6639400b6fe02f Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Sat, 15 Jul 2017 18:49:48 -0700
Subject: [PATCH 06/36] Fix pyflakes errors

---
 fbpic/particles/deposition/threading_methods.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 7f023f5b0..04b46276a 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -10,7 +10,6 @@
 from numba import prange, int64
 import math
 from scipy.constants import c
-import numpy as np
 
 # -------------------------------
 # Particle shape Factor functions 

From f9ce67927d3ce134ccb7a2cc6b6e254c482b15f3 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Sun, 16 Jul 2017 12:54:16 -0700
Subject: [PATCH 07/36] Corrected import pattern in laser antenna

---
 fbpic/lpa_utils/laser/antenna.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fbpic/lpa_utils/laser/antenna.py b/fbpic/lpa_utils/laser/antenna.py
index 3f3e8425c..d1aaf10b0 100644
--- a/fbpic/lpa_utils/laser/antenna.py
+++ b/fbpic/lpa_utils/laser/antenna.py
@@ -10,8 +10,8 @@
 from scipy.constants import e, c, epsilon_0, physical_constants
 r_e = physical_constants['classical electron radius'][0]
 from .profiles import gaussian_profile
-from fbpic.particles.utility_methods import weights
-from fbpic.particles.numba_methods import deposit_field_numba
+from fbpic.particles.utilities.utility_methods import weights
+from fbpic.particles.deposition.numba_methods import deposit_field_numba
 
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed

From eedf0190df8aca9f2fdedb087a57cea1dc4014ce Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Sun, 16 Jul 2017 13:12:51 -0700
Subject: [PATCH 08/36] Fix thread index calculation

---
 fbpic/particles/particles.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index fa4497cc7..18422189f 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -764,7 +764,7 @@ def deposit( self, fld, fieldtype ) :
             # Register particle chunk size for each thread
             tx_N = int(self.Ntot/self.nthreads) 
             tx_chunks = [ tx_N for k in range(self.nthreads) ]
-            tx_chunks[-1] = tx_chunks[-1] + (tx_N)%(self.nthreads)
+            tx_chunks[-1] = tx_chunks[-1] + self.Ntot%(self.nthreads)
             # Multithreading functions for the deposition of rho or J
             # for Mode 0 and 1 only.
             if fieldtype == 'rho':

From 3a5f0f08f5c51c77fdb9c866db4a04ef05567626 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Sun, 16 Jul 2017 21:31:09 -0700
Subject: [PATCH 09/36] Fix automated tests

---
 fbpic/particles/particles.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index 18422189f..818e38632 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -764,7 +764,7 @@ def deposit( self, fld, fieldtype ) :
             # Register particle chunk size for each thread
             tx_N = int(self.Ntot/self.nthreads) 
             tx_chunks = [ tx_N for k in range(self.nthreads) ]
-            tx_chunks[-1] = tx_chunks[-1] + self.Ntot%(self.nthreads)
+            tx_chunks[-1] = tx_chunks[-1] + int(self.Ntot%self.nthreads)
             # Multithreading functions for the deposition of rho or J
             # for Mode 0 and 1 only.
             if fieldtype == 'rho':

From a3c22482378690129efe9e514028e04feb78b29e Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Sun, 16 Jul 2017 21:29:31 -0700
Subject: [PATCH 10/36] Implement parallel reduce

---
 .../particles/deposition/threading_methods.py | 27 ++++++++++++++++---
 fbpic/particles/particles.py                  | 18 ++++++-------
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 04b46276a..02c8d7d9f 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -12,7 +12,7 @@
 from scipy.constants import c
 
 # -------------------------------
-# Particle shape Factor functions 
+# Particle shape Factor functions
 # -------------------------------
 
 # Linear shapes
@@ -116,7 +116,7 @@ def deposit_rho_prange_linear(x, y, z, w,
     # Deposit the field per cell in parallel (for threads < number of cells)
     for tx in prange( nthreads ):
         # Create thread_local helper arrays
-        # FIXME! ( instead of using zeros_like, 
+        # FIXME! ( instead of using zeros_like,
         # it would be nicer to use np.zeros((Nz,Nr)) )
         # Loop over all particles in thread chunk
         for idx in range( tx_chunks[tx] ):
@@ -270,7 +270,7 @@ def deposit_J_prange_linear(x, y, z, w,
     # Deposit the field per cell in parallel (for threads < number of cells)
     for tx in prange( nthreads ):
         # Create thread_local helper arrays
-        # FIXME! ( instead of using zeros_like, 
+        # FIXME! ( instead of using zeros_like,
         # it would be nicer to use np.zeros((Nz,Nr)) )
         # Loop over all particles in thread chunk
         for idx in range( tx_chunks[tx] ):
@@ -427,3 +427,24 @@ def deposit_J_prange_linear(x, y, z, w,
             j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11
 
     return
+
+# -----------------------------------------------------------------------
+# Parallel reduction of the global arrays for threads into a single array
+# -----------------------------------------------------------------------
+
+@numba.njit( parallel=True )
+def sum_reduce_2d_array( global_array, reduced_array ):
+    """
+    # TODO
+    """
+    # Extract size of each dimension
+    Nreduce, Nz, Nr = global_array.shape
+
+    # Parallel loop over iz
+    for iz in prange( Nz ):
+        # Loop over the reduction dimension (slow dimension)
+        for it in range( Nreduce ):
+            # Loop over ir (fast dimension)
+            for ir in range( Nr ):
+
+                reduced_array[ iz, ir ] +=  global_array[ it, iz, ir ]
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index 818e38632..d1893d38f 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -21,7 +21,7 @@
 from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \
     push_x_prange
 from .deposition.threading_methods import deposit_rho_prange_linear, \
-    deposit_J_prange_linear #CUBIC tbd
+    deposit_J_prange_linear, sum_reduce_2d_array #CUBIC tbd
 from .gathering.threading_methods import gather_field_prange_linear, \
     gather_field_prange_cubic
 
@@ -797,8 +797,8 @@ def deposit( self, fld, fieldtype ) :
                                       'linear' or 'cubic' \
                                        but is `%s`" % self.particle_shape)
                 # Sum thread-local results to main field array
-                grid[0].rho = np.sum(rho_m0_global, axis=0)
-                grid[1].rho = np.sum(rho_m1_global, axis=0)
+                sum_reduce_2d_array( rho_m0_global, grid[0].rho )
+                sum_reduce_2d_array( rho_m1_global, grid[1].rho )
 
             elif fieldtype == 'J':
                 # Generate temporary arrays for J
@@ -850,12 +850,12 @@ def deposit( self, fld, fieldtype ) :
                                       'linear' or 'cubic' \
                                        but is `%s`" % self.particle_shape)
                 # Sum thread-local results to main field array
-                grid[0].Jr = np.sum(Jr_m0_global, axis=0)
-                grid[0].Jt = np.sum(Jt_m0_global, axis=0)
-                grid[0].Jz = np.sum(Jz_m0_global, axis=0)
-                grid[1].Jr = np.sum(Jr_m1_global, axis=0)
-                grid[1].Jt = np.sum(Jt_m1_global, axis=0)
-                grid[1].Jz = np.sum(Jz_m1_global, axis=0)
+                sum_reduce_2d_array( Jr_m0_global, grid[0].Jr )
+                sum_reduce_2d_array( Jt_m0_global, grid[0].Jt )
+                sum_reduce_2d_array( Jz_m0_global, grid[0].Jz )
+                sum_reduce_2d_array( Jr_m1_global, grid[1].Jr )
+                sum_reduce_2d_array( Jt_m1_global, grid[1].Jt )
+                sum_reduce_2d_array( Jz_m1_global, grid[1].Jz )
 
             else:
                 raise ValueError("`fieldtype` should be either 'J' or \

From da28bbe3d70489e6cb42c337686fe71f68c08877 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Sun, 16 Jul 2017 23:42:45 -0700
Subject: [PATCH 11/36] Added docstring to the function

---
 fbpic/particles/deposition/threading_methods.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 02c8d7d9f..cb4a3e84a 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -435,7 +435,17 @@ def deposit_J_prange_linear(x, y, z, w,
 @numba.njit( parallel=True )
 def sum_reduce_2d_array( global_array, reduced_array ):
     """
-    # TODO
+    Sum the array `global_array` along its first axis and 
+    add it into `reduced_array`.
+
+    Parameters:
+    -----------
+    global_array: 3darray of complexs
+       Field array whose first dimension corresponds to the 
+       reduction dimension (typically: the number of threads used
+       during the current deposition)
+
+    reduced_array: 2darray of complexs (the sum is added into it, in place)
     """
     # Extract size of each dimension
     Nreduce, Nz, Nr = global_array.shape

From f70cca196102e155bf49f71b78dd0a98342ab2a6 Mon Sep 17 00:00:00 2001
From: Manuel Kirchen <manuel.kirchen@desy.de>
Date: Mon, 17 Jul 2017 17:11:23 +0200
Subject: [PATCH 12/36] Added cubic deposition functions

---
 .../particles/deposition/threading_methods.py | 1200 ++++++++++++++++-
 1 file changed, 1149 insertions(+), 51 deletions(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index cb4a3e84a..3cab552be 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -12,7 +12,7 @@
 from scipy.constants import c
 
 # -------------------------------
-# Particle shape Factor functions
+# Particle shape Factor functions 
 # -------------------------------
 
 # Linear shapes
@@ -78,13 +78,14 @@ def deposit_rho_prange_linear(x, y, z, w,
                            invdz, zmin, Nz,
                            invdr, rmin, Nr,
                            rho_m0_global, rho_m1_global,
-                           nthreads, tx_chunks, tx_N):
+                           nthreads, tx_chunks):
     """
     Deposition of the charge density rho using numba prange on the CPU.
     Iterates over the threads in parallel, while each thread iterates
     over a batch of particles. Intermediate results for each threads are
     stored in copies of the global grid. At the end of the parallel loop,
-    the thread-local field arrays are combined (summed) to the global array.
+    the thread-local field arrays are combined (summed) to a global array.
+    (This final reduction is *not* done in this function)
 
     Calculates the weighted amount of rho that is deposited to the
     4 cells surounding the particle based on its shape (linear).
@@ -99,9 +100,10 @@ def deposit_rho_prange_linear(x, y, z, w,
     w : 1d array of floats
         The weights of the particles
 
-    rho_m0, rho_m1 : 2darrays of complexs
-        The charge density on the interpolation grid for
-        mode 0 and 1. (is modified by this function)
+    rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr)
+        The global helper arrays to store the thread local charge densities
+        on the interpolation grid for mode 0 and 1. 
+        (is modified by this function)
 
     invdz, invdr : float (in meters^-1)
         Inverse of the grid step along the considered direction
@@ -112,16 +114,20 @@ def deposit_rho_prange_linear(x, y, z, w,
 
     Nz, Nr : int
         Number of gridpoints along the considered direction
+
+    nthreads : int
+        Number of CPU threads used with numba prange
+
+    tx_chunks : list of int
+        Exact particle batch size per thread.
+        Last batch size can be greater if Ntot is not a multiple of nthreads.
     """
     # Deposit the field per cell in parallel (for threads < number of cells)
     for tx in prange( nthreads ):
-        # Create thread_local helper arrays
-        # FIXME! ( instead of using zeros_like,
-        # it would be nicer to use np.zeros((Nz,Nr)) )
         # Loop over all particles in thread chunk
         for idx in range( tx_chunks[tx] ):
             # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_N
+            ptcl_idx = idx + tx*tx_chunks[0]
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
@@ -175,10 +181,21 @@ def deposit_rho_prange_linear(x, y, z, w,
             # Boundary Region Shifts
             ir_flip = int( math.floor(r_cell) )
 
-            R_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal
-            R_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal
-            R_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal
-            R_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal
+            # Declare local field array
+            R_m0_00 = 0.
+            R_m0_01 = 0.
+            R_m0_10 = 0.
+            R_m0_11 = 0.
+
+            R_m1_00 = 0. + 0.j
+            R_m1_01 = 0. + 0.j
+            R_m1_10 = 0. + 0.j
+            R_m1_11 = 0. + 0.j
+
+            R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal
+            R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal
+            R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal
+            R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal
 
             if ir_flip == -1:
                 R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
@@ -186,10 +203,10 @@ def deposit_rho_prange_linear(x, y, z, w,
                 R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
                 R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
             else:
-                R_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
-                R_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
-                R_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
-                R_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
+                R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
+                R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
+                R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
+                R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
 
             # Cell shifts for the simulation boundaries
             shift_r = 0
@@ -198,7 +215,8 @@ def deposit_rho_prange_linear(x, y, z, w,
                 shift_r = -1
             if iz_cell+1 > Nz-1:
                 shift_z -= Nz
-            # Write to thread local arrays
+
+            # Write ptcl fields to thread-local part of global deposition array
             rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00
             rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00
 
@@ -225,13 +243,14 @@ def deposit_J_prange_linear(x, y, z, w,
                          j_r_m0_global, j_r_m1_global,
                          j_t_m0_global, j_t_m1_global,
                          j_z_m0_global, j_z_m1_global,
-                         nthreads, tx_chunks, tx_N):
+                         nthreads, tx_chunks):
     """
     Deposition of the current density J using numba prange on the CPU.
     Iterates over the threads in parallel, while each thread iterates
     over a batch of particles. Intermediate results for each threads are
     stored in copies of the global grid. At the end of the parallel loop,
     the thread-local field arrays are combined (summed) to the global array.
+    (This final reduction is *not* done in this function)
 
     Calculates the weighted amount of J that is deposited to the
     4 cells surounding the particle based on its shape (linear).
@@ -252,9 +271,9 @@ def deposit_J_prange_linear(x, y, z, w,
     inv_gamma : 1darray of floats
         The inverse of the relativistic gamma factor
 
-    j_r_m0, j_r_m1, j_t_m0, j_t_m1, j_z_m0, j_z_m1,: 2darrays of complexs
-        The current component in each direction (r, t, z)
-        on the interpolation grid for mode 0 and 1.
+    j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr)
+        The global helper arrays to store the thread local current component
+        in each direction (r, t, z) on the interpolation grid for mode 0 and 1.
         (is modified by this function)
 
     invdz, invdr : float (in meters^-1)
@@ -266,16 +285,20 @@ def deposit_J_prange_linear(x, y, z, w,
 
     Nz, Nr : int
         Number of gridpoints along the considered direction
+
+    nthreads : int
+        Number of CPU threads used with numba prange
+
+    tx_chunks : list of int
+        Exact particle batch size per thread.
+        Last batch size can be greater if Ntot is not a multiple of nthreads.
     """
     # Deposit the field per cell in parallel (for threads < number of cells)
     for tx in prange( nthreads ):
-        # Create thread_local helper arrays
-        # FIXME! ( instead of using zeros_like,
-        # it would be nicer to use np.zeros((Nz,Nr)) )
         # Loop over all particles in thread chunk
         for idx in range( tx_chunks[tx] ):
             # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_N
+            ptcl_idx = idx + tx*tx_chunks[0]
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
@@ -341,18 +364,47 @@ def deposit_J_prange_linear(x, y, z, w,
             # Boundary Region Shifts
             ir_flip = int( math.floor(r_cell) )
 
-            J_r_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal
-            J_t_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal
-            J_z_m0_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal
-            J_r_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal
-            J_t_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal
-            J_z_m0_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal
-            J_r_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal
-            J_t_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal
-            J_z_m1_00 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal
-            J_r_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal
-            J_t_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal
-            J_z_m1_01 = r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+            # Declare local field arrays
+            J_r_m0_00 = 0.
+            J_r_m1_00 = 0. + 0.j
+            J_t_m0_00 = 0.
+            J_t_m1_00 = 0. + 0.j
+            J_z_m0_00 = 0.
+            J_z_m1_00 = 0. + 0.j
+
+            J_r_m0_01 = 0.
+            J_r_m1_01 = 0. + 0.j
+            J_t_m0_01 = 0.
+            J_t_m1_01 = 0. + 0.j
+            J_z_m0_01 = 0.
+            J_z_m1_01 = 0. + 0.j
+
+            J_r_m0_10 = 0.
+            J_r_m1_10 = 0. + 0.j
+            J_t_m0_10 = 0.
+            J_t_m1_10 = 0. + 0.j
+            J_z_m0_10 = 0.
+            J_z_m1_10 = 0. + 0.j
+
+            J_r_m0_11 = 0.
+            J_r_m1_11 = 0. + 0.j
+            J_t_m0_11 = 0.
+            J_t_m1_11 = 0. + 0.j
+            J_z_m0_11 = 0.
+            J_z_m1_11 = 0. + 0.j
+
+            J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+            J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+            J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+            J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+            J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+            J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+            J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+            J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+            J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+            J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+            J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+            J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal
 
             # Take into account lower r flips
             if ir_flip == -1:
@@ -369,18 +421,18 @@ def deposit_J_prange_linear(x, y, z, w,
                 J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
                 J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
             else:
-                J_r_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
-                J_t_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
-                J_z_m0_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
-                J_r_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
-                J_t_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
-                J_z_m0_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
-                J_r_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
-                J_t_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
-                J_z_m1_10 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
-                J_r_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
-                J_t_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
-                J_z_m1_11 = r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+                J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+                J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+                J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+                J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+                J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+                J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+                J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+                J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+                J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+                J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+                J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+                J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
 
             # Cell shifts for the simulation boundaries
             shift_r = 0
@@ -390,6 +442,7 @@ def deposit_J_prange_linear(x, y, z, w,
             if (iz_cell+1) > Nz-1:
                 shift_z -= Nz
 
+            # Write ptcl fields to thread-local part of global deposition array
             j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00
             j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00
 
@@ -428,6 +481,1051 @@ def deposit_J_prange_linear(x, y, z, w,
 
     return
 
+
+# -------------------------------
+# Field deposition - cubic - rho
+# -------------------------------
+
+@numba.njit(parallel=True)
+def deposit_rho_prange_cubic(x, y, z, w,
+                          invdz, zmin, Nz,
+                          invdr, rmin, Nr,
+                          rho_m0_global, rho_m1_global,
+                          nthreads, tx_chunks):
+    """
+
+    Deposition of the charge density rho using numba prange on the CPU.
+    Iterates over the threads in parallel, while each thread iterates
+    over a batch of particles. Intermediate results for each threads are
+    stored in copies of the global grid. At the end of the parallel loop,
+    the thread-local field arrays are combined (summed) to the global array.
+    (This final reduction is *not* done in this function)
+
+    Calculates the weighted amount of rho that is deposited to the
+    16 cells surrounding the particle based on its shape (cubic).
+
+    The rest of the execution is similar to the CUDA equivalent function.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr)
+        The global helper arrays to store the thread local charge densities
+        on the interpolation grid for mode 0 and 1. 
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the considered direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    nthreads : int
+        Number of CPU threads used with numba prange
+
+    tx_chunks : list of int
+        Exact particle batch size per thread.
+        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    """
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    for tx in prange( nthreads ):
+        # Loop over all particles in thread chunk
+        for idx in range( tx_chunks[tx] ):
+            # Calculate thread local particle index
+            ptcl_idx = idx + tx*tx_chunks[0]
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate rho
+            # --------------------------------------------
+            # Mode 0
+            R_m0_scal = wj * exptheta_m0
+            # Mode 1
+            R_m1_scal = wj * exptheta_m1
+
+            # Original index of the upper and lower cell
+            ir_cell = int(math.floor( r_cell ))
+            iz_cell = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_cell < 0:
+                ir_cell = 0
+            # absorbing in upper r
+            if ir_cell > Nr-1:
+                ir_cell = Nr-1
+            # periodic boundaries in z
+            if iz_cell < 0:
+                iz_cell += Nz
+            if iz_cell > Nz-1:
+                iz_cell -= Nz
+
+            # Compute values in local copies and consider boundaries
+            ir_flip = int( math.floor(r_cell) ) - 1
+
+            # Declare the local field value for
+            # all possible deposition directions,
+            # depending on the shape order and per mode.
+            R_m0_00 = 0.
+            R_m1_00 = 0. + 0.j
+
+            R_m0_01 = 0.
+            R_m1_01 = 0. + 0.j
+
+            R_m0_02 = 0.
+            R_m1_02 = 0. + 0.j
+
+            R_m0_03 = 0.
+            R_m1_03 = 0. + 0.j
+
+            R_m0_10 = 0.
+            R_m1_10 = 0. + 0.j
+
+            R_m0_11 = 0.
+            R_m1_11 = 0. + 0.j
+
+            R_m0_12 = 0.
+            R_m1_12 = 0. + 0.j
+
+            R_m0_13 = 0.
+            R_m1_13 = 0. + 0.j
+
+            R_m0_20 = 0.
+            R_m1_20 = 0. + 0.j
+
+            R_m0_21 = 0.
+            R_m1_21 = 0. + 0.j
+
+            R_m0_22 = 0.
+            R_m1_22 = 0. + 0.j
+
+            R_m0_23 = 0.
+            R_m1_23 = 0. + 0.j
+
+            R_m0_30 = 0.
+            R_m1_30 = 0. + 0.j
+
+            R_m0_31 = 0.
+            R_m1_31 = 0. + 0.j
+
+            R_m0_32 = 0.
+            R_m1_32 = 0. + 0.j
+
+            R_m0_33 = 0.
+            R_m1_33 = 0. + 0.j
+
+            if (ir_flip == -2):
+                R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+            if (ir_flip == -1):
+                R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+            if (ir_flip >= 0):
+                R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+            # Index Shifting since local copies are centered around
+            # the current cell
+            srl = 0         # shift r lower
+            sru = 0         # shift r upper inner
+            sru2 = 0        # shift r upper outer
+            szl = 0         # shift z lower
+            szu = 0         # shift z upper inner
+            szu2 = 0        # shift z upper outer
+            if (iz_cell-1) < 0:
+                szl += Nz
+            if (iz_cell) == (Nz - 1):
+                szu -= Nz
+                szu2 -= Nz
+            if (iz_cell+1) == (Nz - 1):
+                szu2 -= Nz
+            if (ir_cell) >= (Nr - 1):
+                sru = -1
+                sru2 = -2
+            if (ir_cell+1) == (Nr - 1):
+                sru2 = -1
+            if (ir_cell-1) < 0:
+                srl = 1
+
+            # Write ptcl fields to thread-local part of global deposition array
+            rho_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00
+            rho_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00
+
+            rho_m0_global[iz_cell, ir_cell - 1 + srl] += R_m0_01
+            rho_m1_global[iz_cell, ir_cell - 1 + srl] += R_m1_01
+
+            rho_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02
+            rho_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02
+
+            rho_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03
+            rho_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03
+
+            rho_m0_global[iz_cell - 1 + szl, ir_cell] += R_m0_10
+            rho_m1_global[iz_cell - 1 + szl, ir_cell] += R_m1_10
+
+            rho_m0_global[iz_cell, ir_cell] += R_m0_11
+            rho_m1_global[iz_cell, ir_cell] += R_m1_11
+
+            rho_m0_global[iz_cell + 1 + szu, ir_cell] += R_m0_12
+            rho_m1_global[iz_cell + 1 + szu, ir_cell] += R_m1_12
+
+            rho_m0_global[iz_cell + 2 + szu2, ir_cell] += R_m0_13
+            rho_m1_global[iz_cell + 2 + szu2, ir_cell] += R_m1_13
+
+            rho_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20
+            rho_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20
+
+            rho_m0_global[iz_cell, ir_cell + 1 + sru] += R_m0_21
+            rho_m1_global[iz_cell, ir_cell + 1 + sru] += R_m1_21
+
+            rho_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22
+            rho_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22
+
+            rho_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23
+            rho_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23
+
+            rho_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30
+            rho_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30
+
+            rho_m0_global[iz_cell, ir_cell + 2 + sru2] += R_m0_31
+            rho_m1_global[iz_cell, ir_cell + 2 + sru2] += R_m1_31
+
+            rho_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32
+            rho_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32
+
+            rho_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33
+            rho_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33
+
+    return
+
+# -------------------------------
+# Field deposition - cubic - J
+# -------------------------------
+
+@numba.njit(parallel=True)
+def deposit_J_prange_cubic(x, y, z, w,
+                        ux, uy, uz, inv_gamma,
+                        invdz, zmin, Nz,
+                        invdr, rmin, Nr,
+                        j_r_m0_global, j_r_m1_global,
+                        j_t_m0_global, j_t_m1_global,
+                        j_z_m0_global, j_z_m1_global,
+                        nthreads, tx_chunks):
+    """
+    Deposition of the current density J using numba prange on the CPU.
+    Iterates over the threads in parallel, while each thread iterates
+    over a batch of particles. Intermediate results for each thread are
+    stored in copies of the global grid. At the end of the parallel loop,
+    the thread-local field arrays are combined (summed) to the global array.
+    (This final reduction is *not* done in this function)
+
+    Calculates the weighted amount of J that is deposited to the
+    16 cells surrounding the particle based on its shape (cubic).
+
+    The rest of the execution is similar to the CUDA equivalent function.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (in meters * second^-1)
+        The velocity of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_r_m0_global, ..., j_z_m1_global : 3darrays of complex (nthreads, Nz, Nr)
+        The global helper arrays to store the thread-local current component
+        in each direction (r, t, z) on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    nthreads : int
+        Number of CPU threads used with numba prange
+
+    tx_chunks : list of int
+        Exact particle batch size per thread.
+        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    """
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    for tx in prange( nthreads ):
+        # Loop over all particles in thread chunk
+        for idx in range( tx_chunks[tx] ):
+            # Calculate thread local particle index
+            ptcl_idx = idx + tx*tx_chunks[0]
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Velocity
+            uxj = ux[ptcl_idx]
+            uyj = uy[ptcl_idx]
+            uzj = uz[ptcl_idx]
+            # Inverse gamma
+            inv_gammaj = inv_gamma[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Get weights for the deposition
+            # --------------------------------------------
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate the currents
+            # --------------------------------------------
+            # Mode 0
+            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
+            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
+            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
+            # Mode 1
+            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
+            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
+            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
+
+            # Original index of the upper and lower cell
+            ir_cell = int(math.floor( r_cell ))
+            iz_cell = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_cell < 0:
+                ir_cell = 0
+            # absorbing in upper r
+            if ir_cell > Nr-1:
+                ir_cell = Nr-1
+            # periodic boundaries in z
+            if iz_cell < 0:
+                iz_cell += Nz
+            if iz_cell > Nz-1:
+                iz_cell -= Nz
+
+            # Compute values in local copies and consider boundaries
+            ir_flip = int64(math.floor(r_cell)) - 1
+
+            # Declare the local field value for
+            # all possible deposition directions,
+            # depending on the shape order and per mode for r,t and z.
+            J_r_m0_00 = 0.
+            J_t_m0_00 = 0.
+            J_z_m0_00 = 0.
+            J_r_m1_00 = 0. + 0.j
+            J_t_m1_00 = 0. + 0.j
+            J_z_m1_00 = 0. + 0.j
+
+            J_r_m0_01 = 0.
+            J_t_m0_01 = 0.
+            J_z_m0_01 = 0.
+            J_r_m1_01 = 0. + 0.j
+            J_t_m1_01 = 0. + 0.j
+            J_z_m1_01 = 0. + 0.j
+
+            J_r_m0_02 = 0.
+            J_t_m0_02 = 0.
+            J_z_m0_02 = 0.
+            J_r_m1_02 = 0. + 0.j
+            J_t_m1_02 = 0. + 0.j
+            J_z_m1_02 = 0. + 0.j
+
+            J_r_m0_03 = 0.
+            J_t_m0_03 = 0.
+            J_z_m0_03 = 0.
+            J_r_m1_03 = 0. + 0.j
+            J_t_m1_03 = 0. + 0.j
+            J_z_m1_03 = 0. + 0.j
+
+            J_r_m0_10 = 0.
+            J_t_m0_10 = 0.
+            J_z_m0_10 = 0.
+            J_r_m1_10 = 0. + 0.j
+            J_t_m1_10 = 0. + 0.j
+            J_z_m1_10 = 0. + 0.j
+
+            J_r_m0_11 = 0.
+            J_t_m0_11 = 0.
+            J_z_m0_11 = 0.
+            J_r_m1_11 = 0. + 0.j
+            J_t_m1_11 = 0. + 0.j
+            J_z_m1_11 = 0. + 0.j
+
+            J_r_m0_12 = 0.
+            J_t_m0_12 = 0.
+            J_z_m0_12 = 0.
+            J_r_m1_12 = 0. + 0.j
+            J_t_m1_12 = 0. + 0.j
+            J_z_m1_12 = 0. + 0.j
+
+            J_r_m0_13 = 0.
+            J_t_m0_13 = 0.
+            J_z_m0_13 = 0.
+            J_r_m1_13 = 0. + 0.j
+            J_t_m1_13 = 0. + 0.j
+            J_z_m1_13 = 0. + 0.j
+
+            J_r_m0_20 = 0.
+            J_t_m0_20 = 0.
+            J_z_m0_20 = 0.
+            J_r_m1_20 = 0. + 0.j
+            J_t_m1_20 = 0. + 0.j
+            J_z_m1_20 = 0. + 0.j
+
+            J_r_m0_21 = 0.
+            J_t_m0_21 = 0.
+            J_z_m0_21 = 0.
+            J_r_m1_21 = 0. + 0.j
+            J_t_m1_21 = 0. + 0.j
+            J_z_m1_21 = 0. + 0.j
+
+            J_r_m0_22 = 0.
+            J_t_m0_22 = 0.
+            J_z_m0_22 = 0.
+            J_r_m1_22 = 0. + 0.j
+            J_t_m1_22 = 0. + 0.j
+            J_z_m1_22 = 0. + 0.j
+
+            J_r_m0_23 = 0.
+            J_t_m0_23 = 0.
+            J_z_m0_23 = 0.
+            J_r_m1_23 = 0. + 0.j
+            J_t_m1_23 = 0. + 0.j
+            J_z_m1_23 = 0. + 0.j
+
+            J_r_m0_30 = 0.
+            J_t_m0_30 = 0.
+            J_z_m0_30 = 0.
+            J_r_m1_30 = 0. + 0.j
+            J_t_m1_30 = 0. + 0.j
+            J_z_m1_30 = 0. + 0.j
+
+            J_r_m0_31 = 0.
+            J_t_m0_31 = 0.
+            J_z_m0_31 = 0.
+            J_r_m1_31 = 0. + 0.j
+            J_t_m1_31 = 0. + 0.j
+            J_z_m1_31 = 0. + 0.j
+
+            J_r_m0_32 = 0.
+            J_t_m0_32 = 0.
+            J_z_m0_32 = 0.
+            J_r_m1_32 = 0. + 0.j
+            J_t_m1_32 = 0. + 0.j
+            J_z_m1_32 = 0. + 0.j
+
+            J_r_m0_33 = 0.
+            J_t_m0_33 = 0.
+            J_z_m0_33 = 0.
+            J_r_m1_33 = 0. + 0.j
+            J_t_m1_33 = 0. + 0.j
+            J_z_m1_33 = 0. + 0.j
+
+            if (ir_flip == -2):
+                J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+                J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+
+                J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+            if (ir_flip == -1):
+                J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+            if (ir_flip >= 0):
+                J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+            # Index Shifting since local copies are centered around
+            # the current cell
+            srl = 0         # shift r lower
+            sru = 0         # shift r upper inner
+            sru2 = 0        # shift r upper outer
+            szl = 0         # shift z lower
+            szu = 0         # shift z upper inner
+            szu2 = 0        # shift z upper outer
+            if (iz_cell-1) < 0:
+                szl += Nz
+            if (iz_cell) == (Nz - 1):
+                szu -= Nz
+                szu2 -= Nz
+            if (iz_cell+1) == (Nz - 1):
+                szu2 -= Nz
+            if (ir_cell) >= (Nr - 1):
+                sru = -1
+                sru2 = -2
+            if (ir_cell+1) == (Nr - 1):
+                sru2 = -1
+            if (ir_cell-1) < 0:
+            ] +=  srl = 1
+
+            j_r_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00
+            j_r_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00
+            j_r_m0_global[iz_cell, ir_cell - 1 + srl] += J_r_m0_01
+            j_r_m1_global[iz_cell, ir_cell - 1 + srl] += J_r_m1_01
+            j_r_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02
+            j_r_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02
+            j_r_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03
+            j_r_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03
+            j_r_m0_global[iz_cell - 1 + szl, ir_cell ] += J_r_m0_10
+            j_r_m1_global[iz_cell - 1 + szl, ir_cell ] += J_r_m1_10
+            j_r_m0_global[iz_cell, ir_cell] += J_r_m0_11
+            j_r_m1_global[iz_cell, ir_cell] += J_r_m1_11
+            j_r_m0_global[iz_cell + 1 + szu, ir_cell] += J_r_m0_12
+            j_r_m1_global[iz_cell + 1 + szu, ir_cell] += J_r_m1_12
+            j_r_m0_global[iz_cell + 2 + szu2, ir_cell] += J_r_m0_13
+            j_r_m1_global[iz_cell + 2 + szu2, ir_cell] += J_r_m1_13
+            j_r_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20
+            j_r_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20
+            j_r_m0_global[iz_cell, ir_cell + 1 + sru] += J_r_m0_21
+            j_r_m1_global[iz_cell, ir_cell + 1 + sru] += J_r_m1_21
+            j_r_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22
+            j_r_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22
+            j_r_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23
+            j_r_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23
+            j_r_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30
+            j_r_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30
+            j_r_m0_global[iz_cell, ir_cell + 2 + sru2] += J_r_m0_31
+            j_r_m1_global[iz_cell, ir_cell + 2 + sru2] += J_r_m1_31
+            j_r_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32
+            j_r_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32
+            j_r_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33
+            j_r_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33
+
+            j_t_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00
+            j_t_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00
+            j_t_m0_global[iz_cell, ir_cell - 1 + srl] += J_t_m0_01
+            j_t_m1_global[iz_cell, ir_cell - 1 + srl] += J_t_m1_01
+            j_t_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02
+            j_t_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02
+            j_t_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03
+            j_t_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03
+            j_t_m0_global[iz_cell - 1 + szl, ir_cell ] += J_t_m0_10
+            j_t_m1_global[iz_cell - 1 + szl, ir_cell ] += J_t_m1_10
+            j_t_m0_global[iz_cell, ir_cell] += J_t_m0_11
+            j_t_m1_global[iz_cell, ir_cell] += J_t_m1_11
+            j_t_m0_global[iz_cell + 1 + szu, ir_cell] += J_t_m0_12
+            j_t_m1_global[iz_cell + 1 + szu, ir_cell] += J_t_m1_12
+            j_t_m0_global[iz_cell + 2 + szu2, ir_cell] += J_t_m0_13
+            j_t_m1_global[iz_cell + 2 + szu2, ir_cell] += J_t_m1_13
+            j_t_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20
+            j_t_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20
+            j_t_m0_global[iz_cell, ir_cell + 1 + sru] += J_t_m0_21
+            j_t_m1_global[iz_cell, ir_cell + 1 + sru] += J_t_m1_21
+            j_t_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22
+            j_t_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22
+            j_t_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23
+            j_t_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23
+            j_t_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30
+            j_t_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30
+            j_t_m0_global[iz_cell, ir_cell + 2 + sru2] += J_t_m0_31
+            j_t_m1_global[iz_cell, ir_cell + 2 + sru2] += J_t_m1_31
+            j_t_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32
+            j_t_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32
+            j_t_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33
+            j_t_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33
+
+            j_z_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00
+            j_z_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00
+            j_z_m0_global[iz_cell, ir_cell - 1 + srl] += J_z_m0_01
+            j_z_m1_global[iz_cell, ir_cell - 1 + srl] += J_z_m1_01
+            j_z_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02
+            j_z_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02
+            j_z_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03
+            j_z_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03
+            j_z_m0_global[iz_cell - 1 + szl, ir_cell ] += J_z_m0_10
+            j_z_m1_global[iz_cell - 1 + szl, ir_cell ] += J_z_m1_10
+            j_z_m0_global[iz_cell, ir_cell] += J_z_m0_11
+            j_z_m1_global[iz_cell, ir_cell] += J_z_m1_11
+            j_z_m0_global[iz_cell + 1 + szu, ir_cell] += J_z_m0_12
+            j_z_m1_global[iz_cell + 1 + szu, ir_cell] += J_z_m1_12
+            j_z_m0_global[iz_cell + 2 + szu2, ir_cell] += J_z_m0_13
+            j_z_m1_global[iz_cell + 2 + szu2, ir_cell] += J_z_m1_13
+            j_z_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20
+            j_z_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20
+            j_z_m0_global[iz_cell, ir_cell + 1 + sru] += J_z_m0_21
+            j_z_m1_global[iz_cell, ir_cell + 1 + sru] += J_z_m1_21
+            j_z_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22
+            j_z_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22
+            j_z_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23
+            j_z_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23
+            j_z_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30
+            j_z_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30
+            j_z_m0_global[iz_cell, ir_cell + 2 + sru2] += J_z_m0_31
+            j_z_m1_global[iz_cell, ir_cell + 2 + sru2] += J_z_m1_31
+            j_z_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32
+            j_z_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32
+            j_z_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33
+            j_z_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33
+
+    return
+
 # -----------------------------------------------------------------------
 # Parallel reduction of the global arrays for threads into a single array
 # -----------------------------------------------------------------------

From 2053e1f674a40b5f8c97aa49b188c124976721e0 Mon Sep 17 00:00:00 2001
From: Manuel Kirchen <manuel.kirchen@desy.de>
Date: Mon, 17 Jul 2017 17:14:47 +0200
Subject: [PATCH 13/36] Adapted particles.py for cubic prange deposition

---
 fbpic/particles/particles.py | 43 ++++++++++++++++--------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index d1893d38f..a64434753 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -21,7 +21,8 @@
 from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \
     push_x_prange
 from .deposition.threading_methods import deposit_rho_prange_linear, \
-    deposit_J_prange_linear, sum_reduce_2d_array #CUBIC tbd
+    deposit_J_prange_linear, deposit_rho_prange_cubic,
+    deposit_J_prange_cubic, sum_reduce_2d_array 
 from .gathering.threading_methods import gather_field_prange_linear, \
     gather_field_prange_cubic
 
@@ -782,16 +783,14 @@ def deposit( self, fld, fieldtype ) :
                         grid[0].invdz, grid[0].zmin, grid[0].Nz,
                         grid[0].invdr, grid[0].rmin, grid[0].Nr,
                         rho_m0_global, rho_m1_global,
-                        self.nthreads, tx_chunks, tx_N )
+                        self.nthreads, tx_chunks )
                 elif self.particle_shape == 'cubic':
-                    print('Not yet implemented')
-                    # deposit_rho_prange_cubic(
-                    #     self.x, self.y, self.z, self.w,
-                    #     grid[0].invdz, grid[0].zmin, grid[0].Nz,
-                    #     grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                    #     rho_m0_global, rho_m1_global,
-                    #     grid[0].rho, grid[1].rho,
-                    #     self.nthreads, tx_chunks, tx_N )
+                    deposit_rho_prange_cubic(
+                        self.x, self.y, self.z, self.w,
+                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                        rho_m0_global, rho_m1_global,
+                        self.nthreads, tx_chunks )
                 else:
                     raise ValueError("`particle_shape` should be either \
                                       'linear' or 'cubic' \
@@ -830,21 +829,17 @@ def deposit( self, fld, fieldtype ) :
                         Jr_m0_global, Jr_m1_global,
                         Jt_m0_global, Jt_m1_global,
                         Jz_m0_global, Jz_m1_global,
-                        self.nthreads, tx_chunks, tx_N )
+                        self.nthreads, tx_chunks )
                 elif self.particle_shape == 'cubic':
-                    print('Not yet implemented')
-                    # deposit_J_prange_cubic(
-                    #     self.x, self.y, self.z, self.w,
-                    #     self.ux, self.uy, self.uz, self.inv_gamma,
-                    #     grid[0].invdz, grid[0].zmin, grid[0].Nz,
-                    #     grid[0].invdr, grid[0].rmin, grid[0].Nr,
-                    #     Jr_m0_global, Jr_m1_global,
-                    #     Jt_m0_global, Jt_m1_global,
-                    #     Jz_m0_global, Jz_m1_global,
-                    #     grid[0].Jr, grid[1].Jr,
-                    #     grid[0].Jt, grid[1].Jt,
-                    #     grid[0].Jz, grid[1].Jz,
-                    #     self.nthreads, tx_chunks, tx_N )
+                    deposit_J_prange_cubic(
+                        self.x, self.y, self.z, self.w,
+                        self.ux, self.uy, self.uz, self.inv_gamma,
+                        grid[0].invdz, grid[0].zmin, grid[0].Nz,
+                        grid[0].invdr, grid[0].rmin, grid[0].Nr,
+                        Jr_m0_global, Jr_m1_global,
+                        Jt_m0_global, Jt_m1_global,
+                        Jz_m0_global, Jz_m1_global,
+                        self.nthreads, tx_chunks )
                 else:
                     raise ValueError("`particle_shape` should be either \
                                       'linear' or 'cubic' \

From 392354dff6009ebed9bfc5dee3cff2c5b89111d3 Mon Sep 17 00:00:00 2001
From: Manuel Kirchen <manuel.kirchen@desy.de>
Date: Mon, 17 Jul 2017 17:18:10 +0200
Subject: [PATCH 14/36] Removed linear_non_atomic shape from uniform_rho test

---
 tests/test_uniform_rho_deposition.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_uniform_rho_deposition.py b/tests/test_uniform_rho_deposition.py
index e0a7978e6..8978f9625 100644
--- a/tests/test_uniform_rho_deposition.py
+++ b/tests/test_uniform_rho_deposition.py
@@ -10,7 +10,7 @@
   (i.e. this confirms that no Verboncoeur-type correction is needed)
 - Shifting this plasma by a small amount in r, and still verifying
   that the deposited density is uniform
- The tests are performed with different particle shapes: linear
+ The tests are performed with different particle shapes: linear, cubic
 
 Usage :
 from the top-level directory of FBPIC run
@@ -47,7 +47,7 @@
 
 def test_uniform_electron_plasma(show=False):
     "Function that is run by py.test, when doing `python setup.py test`"
-    for shape in ['cubic', 'linear', 'linear_non_atomic']:
+    for shape in ['linear', 'cubic']:
         uniform_electron_plasma( shape, show )
 
 def uniform_electron_plasma(shape, show=False):
@@ -87,7 +87,7 @@ def uniform_electron_plasma(shape, show=False):
 
 def test_neutral_plasma_shifted(show=False):
     "Function that is run by py.test, when doing `python setup.py test`"
-    for shape in ['cubic', 'linear', 'linear_non_atomic']:
+    for shape in ['linear', 'cubic']:
         neutral_plasma_shifted( shape, show )
 
 def neutral_plasma_shifted(shape, show=False):

From 705584511199a4061d1963509e5a27da3ce846cf Mon Sep 17 00:00:00 2001
From: Manuel Kirchen <manuel.kirchen@desy.de>
Date: Mon, 17 Jul 2017 17:23:46 +0200
Subject: [PATCH 15/36] Corrected some bugs introduced in last commits

---
 fbpic/particles/deposition/threading_methods.py | 2 +-
 fbpic/particles/particles.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 3cab552be..55d53bbf0 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -1423,7 +1423,7 @@ def deposit_J_prange_cubic(x, y, z, w,
             if (ir_cell+1) == (Nr - 1):
                 sru2 = -1
             if (ir_cell-1) < 0:
-            ] +=  srl = 1
+                srl = 1
 
             j_r_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00
             j_r_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index a64434753..cf8401d25 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -21,7 +21,7 @@
 from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \
     push_x_prange
 from .deposition.threading_methods import deposit_rho_prange_linear, \
-    deposit_J_prange_linear, deposit_rho_prange_cubic,
+    deposit_J_prange_linear, deposit_rho_prange_cubic, \
     deposit_J_prange_cubic, sum_reduce_2d_array 
 from .gathering.threading_methods import gather_field_prange_linear, \
     gather_field_prange_cubic

From 0a22108a1517fd8dc6fa46d67f9c1423a63116df Mon Sep 17 00:00:00 2001
From: Manuel Kirchen <manuel.kirchen@desy.de>
Date: Mon, 17 Jul 2017 19:00:21 +0200
Subject: [PATCH 16/36] Fix cubic deposition and cubic gathering

---
 .../particles/deposition/threading_methods.py | 275 +++++++++---------
 fbpic/particles/gathering/cuda_methods.py     |   2 +-
 .../particles/gathering/threading_methods.py  |  49 +++-
 3 files changed, 166 insertions(+), 160 deletions(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 55d53bbf0..11ecb54c8 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -780,53 +780,38 @@ def deposit_rho_prange_cubic(x, y, z, w,
                 srl = 1
 
             # Write ptcl fields to thread-local part of global deposition array
-            rho_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00
-            rho_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00
-
-            rho_m0_global[iz_cell, ir_cell - 1 + srl] += R_m0_01
-            rho_m1_global[iz_cell, ir_cell - 1 + srl] += R_m1_01
-
-            rho_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02
-            rho_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02
-
-            rho_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03
-            rho_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03
-
-            rho_m0_global[iz_cell - 1 + szl, ir_cell] += R_m0_10
-            rho_m1_global[iz_cell - 1 + szl, ir_cell] += R_m1_10
-
-            rho_m0_global[iz_cell, ir_cell] += R_m0_11
-            rho_m1_global[iz_cell, ir_cell] += R_m1_11
-
-            rho_m0_global[iz_cell + 1 + szu, ir_cell] += R_m0_12
-            rho_m1_global[iz_cell + 1 + szu, ir_cell] += R_m1_12
-
-            rho_m0_global[iz_cell + 2 + szu2, ir_cell] += R_m0_13
-            rho_m1_global[iz_cell + 2 + szu2, ir_cell] += R_m1_13
-
-            rho_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20
-            rho_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20
-
-            rho_m0_global[iz_cell, ir_cell + 1 + sru] += R_m0_21
-            rho_m1_global[iz_cell, ir_cell + 1 + sru] += R_m1_21
-
-            rho_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22
-            rho_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22
-
-            rho_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23
-            rho_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23
-
-            rho_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30
-            rho_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30
-
-            rho_m0_global[iz_cell, ir_cell + 2 + sru2] += R_m0_31
-            rho_m1_global[iz_cell, ir_cell + 2 + sru2] += R_m1_31
-
-            rho_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32
-            rho_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32
-
-            rho_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33
-            rho_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33
+            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00
+            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00
+            rho_m0_global[tx, iz_cell, ir_cell - 1 + srl] += R_m0_01
+            rho_m1_global[tx, iz_cell, ir_cell - 1 + srl] += R_m1_01
+            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02
+            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02
+            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03
+            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03
+            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell] += R_m0_10
+            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell] += R_m1_10
+            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_11
+            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_11
+            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell] += R_m0_12
+            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell] += R_m1_12
+            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m0_13
+            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m1_13
+            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20
+            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20
+            rho_m0_global[tx, iz_cell, ir_cell + 1 + sru] += R_m0_21
+            rho_m1_global[tx, iz_cell, ir_cell + 1 + sru] += R_m1_21
+            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22
+            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22
+            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23
+            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23
+            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30
+            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30
+            rho_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m0_31
+            rho_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m1_31
+            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32
+            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32
+            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33
+            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33
 
     return
 
@@ -1425,104 +1410,104 @@ def deposit_J_prange_cubic(x, y, z, w,
             if (ir_cell-1) < 0:
                 srl = 1
 
-            j_r_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00
-            j_r_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00
-            j_r_m0_global[iz_cell, ir_cell - 1 + srl] += J_r_m0_01
-            j_r_m1_global[iz_cell, ir_cell - 1 + srl] += J_r_m1_01
-            j_r_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02
-            j_r_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02
-            j_r_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03
-            j_r_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03
-            j_r_m0_global[iz_cell - 1 + szl, ir_cell ] += J_r_m0_10
-            j_r_m1_global[iz_cell - 1 + szl, ir_cell ] += J_r_m1_10
-            j_r_m0_global[iz_cell, ir_cell] += J_r_m0_11
-            j_r_m1_global[iz_cell, ir_cell] += J_r_m1_11
-            j_r_m0_global[iz_cell + 1 + szu, ir_cell] += J_r_m0_12
-            j_r_m1_global[iz_cell + 1 + szu, ir_cell] += J_r_m1_12
-            j_r_m0_global[iz_cell + 2 + szu2, ir_cell] += J_r_m0_13
-            j_r_m1_global[iz_cell + 2 + szu2, ir_cell] += J_r_m1_13
-            j_r_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20
-            j_r_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20
-            j_r_m0_global[iz_cell, ir_cell + 1 + sru] += J_r_m0_21
-            j_r_m1_global[iz_cell, ir_cell + 1 + sru] += J_r_m1_21
-            j_r_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22
-            j_r_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22
-            j_r_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23
-            j_r_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23
-            j_r_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30
-            j_r_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30
-            j_r_m0_global[iz_cell, ir_cell + 2 + sru2] += J_r_m0_31
-            j_r_m1_global[iz_cell, ir_cell + 2 + sru2] += J_r_m1_31
-            j_r_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32
-            j_r_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32
-            j_r_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33
-            j_r_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33
-
-            j_t_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00
-            j_t_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00
-            j_t_m0_global[iz_cell, ir_cell - 1 + srl] += J_t_m0_01
-            j_t_m1_global[iz_cell, ir_cell - 1 + srl] += J_t_m1_01
-            j_t_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02
-            j_t_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02
-            j_t_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03
-            j_t_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03
-            j_t_m0_global[iz_cell - 1 + szl, ir_cell ] += J_t_m0_10
-            j_t_m1_global[iz_cell - 1 + szl, ir_cell ] += J_t_m1_10
-            j_t_m0_global[iz_cell, ir_cell] += J_t_m0_11
-            j_t_m1_global[iz_cell, ir_cell] += J_t_m1_11
-            j_t_m0_global[iz_cell + 1 + szu, ir_cell] += J_t_m0_12
-            j_t_m1_global[iz_cell + 1 + szu, ir_cell] += J_t_m1_12
-            j_t_m0_global[iz_cell + 2 + szu2, ir_cell] += J_t_m0_13
-            j_t_m1_global[iz_cell + 2 + szu2, ir_cell] += J_t_m1_13
-            j_t_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20
-            j_t_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20
-            j_t_m0_global[iz_cell, ir_cell + 1 + sru] += J_t_m0_21
-            j_t_m1_global[iz_cell, ir_cell + 1 + sru] += J_t_m1_21
-            j_t_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22
-            j_t_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22
-            j_t_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23
-            j_t_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23
-            j_t_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30
-            j_t_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30
-            j_t_m0_global[iz_cell, ir_cell + 2 + sru2] += J_t_m0_31
-            j_t_m1_global[iz_cell, ir_cell + 2 + sru2] += J_t_m1_31
-            j_t_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32
-            j_t_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32
-            j_t_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33
-            j_t_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33
-
-            j_z_m0_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00
-            j_z_m1_global[iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00
-            j_z_m0_global[iz_cell, ir_cell - 1 + srl] += J_z_m0_01
-            j_z_m1_global[iz_cell, ir_cell - 1 + srl] += J_z_m1_01
-            j_z_m0_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02
-            j_z_m1_global[iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02
-            j_z_m0_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03
-            j_z_m1_global[iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03
-            j_z_m0_global[iz_cell - 1 + szl, ir_cell ] += J_z_m0_10
-            j_z_m1_global[iz_cell - 1 + szl, ir_cell ] += J_z_m1_10
-            j_z_m0_global[iz_cell, ir_cell] += J_z_m0_11
-            j_z_m1_global[iz_cell, ir_cell] += J_z_m1_11
-            j_z_m0_global[iz_cell + 1 + szu, ir_cell] += J_z_m0_12
-            j_z_m1_global[iz_cell + 1 + szu, ir_cell] += J_z_m1_12
-            j_z_m0_global[iz_cell + 2 + szu2, ir_cell] += J_z_m0_13
-            j_z_m1_global[iz_cell + 2 + szu2, ir_cell] += J_z_m1_13
-            j_z_m0_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20
-            j_z_m1_global[iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20
-            j_z_m0_global[iz_cell, ir_cell + 1 + sru] += J_z_m0_21
-            j_z_m1_global[iz_cell, ir_cell + 1 + sru] += J_z_m1_21
-            j_z_m0_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22
-            j_z_m1_global[iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22
-            j_z_m0_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23
-            j_z_m1_global[iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23
-            j_z_m0_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30
-            j_z_m1_global[iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30
-            j_z_m0_global[iz_cell, ir_cell + 2 + sru2] += J_z_m0_31
-            j_z_m1_global[iz_cell, ir_cell + 2 + sru2] += J_z_m1_31
-            j_z_m0_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32
-            j_z_m1_global[iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32
-            j_z_m0_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33
-            j_z_m1_global[iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33
+            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00
+            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00
+            j_r_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m0_01
+            j_r_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m1_01
+            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02
+            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02
+            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03
+            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03
+            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10
+            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10
+            j_r_m0_global[tx, iz_cell, ir_cell] += J_r_m0_11
+            j_r_m1_global[tx, iz_cell, ir_cell] += J_r_m1_11
+            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m0_12
+            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m1_12
+            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13
+            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13
+            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20
+            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20
+            j_r_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m0_21
+            j_r_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m1_21
+            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22
+            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22
+            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23
+            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23
+            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30
+            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30
+            j_r_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31
+            j_r_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31
+            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32
+            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32
+            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33
+            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33
+
+            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00
+            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00
+            j_t_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m0_01
+            j_t_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m1_01
+            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02
+            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02
+            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03
+            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03
+            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10
+            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10
+            j_t_m0_global[tx, iz_cell, ir_cell] += J_t_m0_11
+            j_t_m1_global[tx, iz_cell, ir_cell] += J_t_m1_11
+            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m0_12
+            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m1_12
+            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13
+            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13
+            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20
+            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20
+            j_t_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m0_21
+            j_t_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m1_21
+            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22
+            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22
+            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23
+            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23
+            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30
+            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30
+            j_t_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31
+            j_t_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31
+            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32
+            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32
+            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33
+            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33
+
+            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00
+            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00
+            j_z_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m0_01
+            j_z_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m1_01
+            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02
+            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02
+            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03
+            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03
+            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10
+            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10
+            j_z_m0_global[tx, iz_cell, ir_cell] += J_z_m0_11
+            j_z_m1_global[tx, iz_cell, ir_cell] += J_z_m1_11
+            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m0_12
+            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m1_12
+            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13
+            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13
+            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20
+            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20
+            j_z_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m0_21
+            j_z_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m1_21
+            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22
+            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22
+            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23
+            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23
+            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30
+            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30
+            j_z_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31
+            j_z_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31
+            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32
+            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32
+            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33
+            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33
 
     return
 
diff --git a/fbpic/particles/gathering/cuda_methods.py b/fbpic/particles/gathering/cuda_methods.py
index 392d3ef52..12c763860 100644
--- a/fbpic/particles/gathering/cuda_methods.py
+++ b/fbpic/particles/gathering/cuda_methods.py
@@ -430,8 +430,8 @@ def gather_field_gpu_cubic(x, y, z,
         z_cell = invdz*(zj - zmin) - 0.5
 
         # Calculate the shape factors
-        Sr = cuda.local.array((4,), dtype=float64)
         ir = cuda.local.array((4,), dtype=int64)
+        Sr = cuda.local.array((4,), dtype=float64)
         ir[0] = int64(math.floor(r_cell)) - 1
         ir[1] = ir[0] + 1
         ir[2] = ir[1] + 1
diff --git a/fbpic/particles/gathering/threading_methods.py b/fbpic/particles/gathering/threading_methods.py
index 73da99cfe..2c996a5ff 100644
--- a/fbpic/particles/gathering/threading_methods.py
+++ b/fbpic/particles/gathering/threading_methods.py
@@ -9,6 +9,7 @@
 from numba import prange, int64
 import numba
 import math
+import numpy as np
 
 # -----------------------
 # Field gathering linear
@@ -411,8 +412,8 @@ def gather_field_prange_cubic(x, y, z,
         z_cell = invdz*(zj - zmin) - 0.5
 
         # Calculate the shape factors
-        Sr = [0.,0.,0.,0.]
-        ir = [0,0,0,0]
+        ir = np.zeros(4, dtype=int64) #[0,0,0,0]
+        Sr = np.zeros(4) #[0.,0.,0.,0.]
         ir[0] = int64(math.floor(r_cell)) - 1
         ir[1] = ir[0] + 1
         ir[2] = ir[1] + 1
@@ -421,8 +422,8 @@ def gather_field_prange_cubic(x, y, z,
         Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4)
         Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4)
         Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3
-        iz = [0.,0.,0.,0.]
-        Sz = [0,0,0,0]
+        iz = np.zeros(4, dtype=int64) #[0,0,0,0]
+        Sz = np.zeros(4) #[0.,0.,0.,0.]
         iz[0] = int64(math.floor(z_cell)) - 1
         iz[1] = iz[0] + 1
         iz[2] = iz[1] + 1
@@ -432,18 +433,22 @@ def gather_field_prange_cubic(x, y, z,
         Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4)
         Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3
         # Lower and upper periodic boundary for z
-        for index_z in range(4):
+        index_z = 0
+        while index_z < 4:
             if iz[index_z] < 0:
                 iz[index_z] += Nz
             if iz[index_z] > Nz - 1:
                 iz[index_z] -= Nz
+            index_z += 1
         # Lower and upper boundary for r
-        for index_r in range(4):
+        index_r = 0
+        while index_r < 4:
             if ir[index_r] < 0:
                 ir[index_r] = abs(ir[index_r])-1
                 Sr[index_r] = (-1.)*Sr[index_r]
             if ir[index_r] > Nr - 1:
                 ir[index_r] = Nr - 1
+            index_r += 1
 
         # E-Field
         # ----------------------------
@@ -461,8 +466,10 @@ def gather_field_prange_cubic(x, y, z,
         Ft_m = 0.j
         Fz_m = 0.j
         # Add the fields for mode 0
-        for index_r in range(4):
-            for index_z in range(4):
+        index_r = 0
+        while index_r < 4:
+            index_z = 0
+            while index_z < 4:
                 Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]]
                 Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]]
                 if Sz[index_z]*Sr[index_r] < 0:
@@ -471,6 +478,8 @@ def gather_field_prange_cubic(x, y, z,
                 else:
                     Fz_m += Sz[index_z]*Sr[index_r]* \
                         Ez_m0[iz[index_z], ir[index_r]]
+                index_z += 1
+            index_r += 1
 
         Fr += (Fr_m*exptheta_m0).real
         Ft += (Ft_m*exptheta_m0).real
@@ -484,8 +493,10 @@ def gather_field_prange_cubic(x, y, z,
         Ft_m = 0.j
         Fz_m = 0.j
         # Add the fields for mode 1
-        for index_r in range(4):
-            for index_z in range(4):
+        index_r = 0
+        while index_r < 4:
+            index_z = 0
+            while index_z < 4:
                 if Sz[index_z]*Sr[index_r] < 0:
                     Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
                                 Er_m1[iz[index_z], ir[index_r]]
@@ -497,6 +508,8 @@ def gather_field_prange_cubic(x, y, z,
                     Ft_m += Sz[index_z]*Sr[index_r]* \
                                 Et_m1[iz[index_z], ir[index_r]]
                 Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]]
+                index_z += 1
+            index_r += 1
 
         # Add the fields from the mode 1
         Fr += 2*(Fr_m*exptheta_m1).real
@@ -525,8 +538,10 @@ def gather_field_prange_cubic(x, y, z,
         Ft_m = 0.j
         Fz_m = 0.j
         # Add the fields for mode 0
-        for index_r in range(4):
-            for index_z in range(4):
+        index_r = 0
+        while index_r < 4:
+            index_z = 0
+            while index_z < 4:
                 Fr_m += Sz[index_z]*Sr[index_r]* \
                     Br_m0[iz[index_z], ir[index_r]]
                 Ft_m += Sz[index_z]*Sr[index_r]* \
@@ -537,6 +552,8 @@ def gather_field_prange_cubic(x, y, z,
                 else:
                     Fz_m += Sz[index_z]*Sr[index_r]* \
                         Bz_m0[iz[index_z], ir[index_r]]
+                index_z += 1
+            index_r += 1
 
         # Add the fields from the mode 0
         Fr += (Fr_m*exptheta_m0).real
@@ -552,8 +569,10 @@ def gather_field_prange_cubic(x, y, z,
         Fz_m = 0.j
 
         # Add the fields for mode 1
-        for index_r in range(4):
-            for index_z in range(4):
+        index_r = 0
+        while index_r < 4:
+            index_z = 0
+            while index_z < 4:
                 if Sz[index_z]*Sr[index_r] < 0:
                     Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
                         Br_m1[iz[index_z], ir[index_r]]
@@ -565,6 +584,8 @@ def gather_field_prange_cubic(x, y, z,
                     Ft_m += Sz[index_z]*Sr[index_r]* \
                         Bt_m1[iz[index_z], ir[index_r]]
                 Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]]
+                index_z += 1
+            index_r += 1
 
         # Add the fields from the mode 1
         Fr += 2*(Fr_m*exptheta_m1).real

From c2f75b3889fc7a130e35feb7aea50dc8cc2ac124 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 18 Jul 2017 21:55:46 -0700
Subject: [PATCH 17/36] Remove function signature in field methods

---
 fbpic/fields/numba_methods.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py
index 830fec37a..b6c8d98a9 100644
--- a/fbpic/fields/numba_methods.py
+++ b/fbpic/fields/numba_methods.py
@@ -9,10 +9,7 @@
 from scipy.constants import c, epsilon_0, mu_0
 c2 = c**2
 
-@numba.jit('void(complex128[:,:], complex128[:,:], \
-           complex128[:,:], complex128[:,:], complex128[:,:], \
-           float64[:,:], float64[:,:], float64[:,:], \
-           float64, int32, int32)')
+@numba.njit
 def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
                             kz, kr, inv_k2, inv_dt, Nz, Nr ):
     """
@@ -33,13 +30,7 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
             Jm[iz, ir] += -0.5 * kr[iz, ir] * F
             Jz[iz, ir] += -1.j * kz[iz, ir] * F
 
-@numba.jit('void(complex128[:,:], complex128[:,:], complex128[:,:], \
-           complex128[:,:], complex128[:,:], complex128[:,:], \
-           complex128[:,:], complex128[:,:], complex128[:,:], \
-           complex128[:,:], complex128[:,:], \
-           float64[:,:], float64[:,:], float64[:,:], \
-           float64[:,:], float64[:,:], float64[:,:], float64[:,:], float64, \
-           int8, int32, int32)')
+@numba.njit
 def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                        rho_prev, rho_next,
                        rho_prev_coef, rho_next_coef, j_coef,
@@ -106,7 +97,7 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                 + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
                             + 1.j*kr[iz, ir]*Jm[iz, ir] )
 
-@numba.jit
+@numba.njit
 def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
                             kz, kr, inv_k2,
                             j_corr_coef, T_eb, T_cc,
@@ -130,7 +121,7 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
             Jm[iz, ir] += -0.5 * kr[iz, ir] * F
             Jz[iz, ir] += -1.j * kz[iz, ir] * F
 
-@numba.jit
+@numba.njit
 def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                        rho_prev, rho_next,
                        rho_prev_coef, rho_next_coef, j_coef,

From 18486c13d694301841a7214a3cfb6fb86919a6fc Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 18 Jul 2017 22:14:56 -0700
Subject: [PATCH 18/36] Create threading_utils.py

---
 fbpic/threading_utils.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 fbpic/threading_utils.py

diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py
new file mode 100644
index 000000000..1fe2d4057
--- /dev/null
+++ b/fbpic/threading_utils.py
@@ -0,0 +1,15 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines a set of generic functions for multithreaded CPU execution.
+"""
+try:
+    # Try to import the threading function prange
+    from numba import prange
+    threading_installed = True
+except ImportError:
+    # If not, replace the threading functions by single-thread functions
+    prange = range
+    threading_installed = False

From fce66929e2e1cf9259dd52229f0c85d77d3e64b0 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 18 Jul 2017 22:18:28 -0700
Subject: [PATCH 19/36] Check if threading is installed in main.py

---
 fbpic/main.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fbpic/main.py b/fbpic/main.py
index e3ce1e975..a87604771 100644
--- a/fbpic/main.py
+++ b/fbpic/main.py
@@ -11,6 +11,8 @@
 # as it sets the cuda context)
 from mpi4py import MPI
 import numba
+# Check if threading is available
+from .threading_utils import threading_installed
 # Check if CUDA is available, then import CUDA functions
 from .cuda_utils import cuda_installed
 if cuda_installed:
@@ -44,8 +46,8 @@ def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax,
                  n_order=-1, dens_func=None, filter_currents=True,
                  v_comoving=None, use_galilean=False, initialize_ions=False,
                  use_cuda=False, use_threading=True, nthreads=None,
-                 n_guard=None, n_damp=30, exchange_period=None, 
-                 boundaries='periodic', gamma_boost=None, 
+                 n_guard=None, n_damp=30, exchange_period=None,
+                 boundaries='periodic', gamma_boost=None,
                  use_all_mpi_ranks=True, particle_shape='linear' ):
         """
         Initializes a simulation, by creating the following structures:
@@ -200,6 +202,10 @@ def dens_func( z, r ) ...
             self.use_cuda = False
         # CPU multi-threading
         self.use_threading = use_threading
+        if use_threading and not threading_installed:
+            print('*** Threading not available for the simulation.')
+            print('*** (Please make sure that numba>0.34 is installed)')
+            self.use_threading = False
         if self.use_threading:
             # Define number of threads used
             if nthreads is not None:
@@ -620,7 +626,7 @@ def print_simulation_setup( comm, use_cuda, use_threading ):
         if use_threading and not use_cuda:
             message += " (%d threads per proc)" %numba.config.NUMBA_NUM_THREADS
         message += ".\n"
-            
+
         print( message )
 
 def adapt_to_grid( x, p_xmin, p_xmax, p_nx, ncells_empty=0 ):

From b6c35844ea3b3fd0b3aabf0846a45aff33925144 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 18 Jul 2017 22:22:59 -0700
Subject: [PATCH 20/36] Added threaded methods for the fields

---
 fbpic/fields/numba_methods.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py
index b6c8d98a9..0428ae216 100644
--- a/fbpic/fields/numba_methods.py
+++ b/fbpic/fields/numba_methods.py
@@ -8,15 +8,16 @@
 import numba
 from scipy.constants import c, epsilon_0, mu_0
 c2 = c**2
+from fbpic.threading_utils import threading_installed, prange
 
-@numba.njit
+@numba.njit( parallel=threading_installed )
 def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
                             kz, kr, inv_k2, inv_dt, Nz, Nr ):
     """
     Correct the currents in spectral space, using the standard pstad
     """
     # Loop over the 2D grid
-    for iz in range(Nz):
+    for iz in prange(Nz):
         for ir in range(Nr):
 
             # Calculate the intermediate variable F
@@ -30,7 +31,9 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
             Jm[iz, ir] += -0.5 * kr[iz, ir] * F
             Jz[iz, ir] += -1.j * kz[iz, ir] * F
 
-@numba.njit
+    return
+
+@numba.njit( parallel=threading_installed )
 def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                        rho_prev, rho_next,
                        rho_prev_coef, rho_next_coef, j_coef,
@@ -42,7 +45,7 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
     See the documentation of SpectralGrid.push_eb_with
     """
     # Loop over the 2D grid
-    for iz in range(Nz):
+    for iz in prange(Nz):
         for ir in range(Nr):
 
             # Save the electric fields, since it is needed for the B push
@@ -97,7 +100,9 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                 + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
                             + 1.j*kr[iz, ir]*Jm[iz, ir] )
 
-@numba.njit
+    return
+
+@numba.njit( parallel=threading_installed )
 def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
                             kz, kr, inv_k2,
                             j_corr_coef, T_eb, T_cc,
@@ -107,7 +112,7 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
     of comoving currents
     """
     # Loop over the 2D grid
-    for iz in range(Nz):
+    for iz in prange(Nz):
         for ir in range(Nr):
 
             # Calculate the intermediate variable F
@@ -121,7 +126,9 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
             Jm[iz, ir] += -0.5 * kr[iz, ir] * F
             Jz[iz, ir] += -1.j * kz[iz, ir] * F
 
-@numba.njit
+    return
+
+@numba.njit( parallel=threading_installed )
 def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                        rho_prev, rho_next,
                        rho_prev_coef, rho_next_coef, j_coef,
@@ -198,3 +205,5 @@ def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                             + 1.j*kr[iz, ir]*Em_old ) \
                 + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
                             + 1.j*kr[iz, ir]*Jm[iz, ir] )
+
+    return

From 0c52a62a7403412d24c9e583cc46c89bb6e7dee5 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 18 Jul 2017 22:51:52 -0700
Subject: [PATCH 21/36] Added parallel capability for grid methods

---
 fbpic/fields/numba_methods.py | 16 +++++++-------
 fbpic/main.py                 |  2 +-
 fbpic/particles/particles.py  | 39 +++++++++++++++++++----------------
 fbpic/threading_utils.py      |  4 ++++
 4 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py
index 0428ae216..f45e10a23 100644
--- a/fbpic/fields/numba_methods.py
+++ b/fbpic/fields/numba_methods.py
@@ -8,15 +8,15 @@
 import numba
 from scipy.constants import c, epsilon_0, mu_0
 c2 = c**2
-from fbpic.threading_utils import threading_installed, prange
+from fbpic.threading_utils import njit_parallel, prange
 
-@numba.njit( parallel=threading_installed )
+@njit_parallel
 def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
                             kz, kr, inv_k2, inv_dt, Nz, Nr ):
     """
     Correct the currents in spectral space, using the standard pstad
     """
-    # Loop over the 2D grid
+    # Loop over the 2D grid (parallel in z, if threading is installed)
     for iz in prange(Nz):
         for ir in range(Nr):
 
@@ -33,7 +33,7 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
 
     return
 
-@numba.njit( parallel=threading_installed )
+@njit_parallel
 def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                        rho_prev, rho_next,
                        rho_prev_coef, rho_next_coef, j_coef,
@@ -44,7 +44,7 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
 
     See the documentation of SpectralGrid.push_eb_with
     """
-    # Loop over the 2D grid
+    # Loop over the 2D grid (parallel in z, if threading is installed)
     for iz in prange(Nz):
         for ir in range(Nr):
 
@@ -102,7 +102,7 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
 
     return
 
-@numba.njit( parallel=threading_installed )
+@njit_parallel
 def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
                             kz, kr, inv_k2,
                             j_corr_coef, T_eb, T_cc,
@@ -111,7 +111,7 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
     Correct the currents in spectral space, using the assumption
     of comoving currents
     """
-    # Loop over the 2D grid
+    # Loop over the 2D grid (parallel in z, if threading is installed)
     for iz in prange(Nz):
         for ir in range(Nr):
 
@@ -128,7 +128,7 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
 
     return
 
-@numba.njit( parallel=threading_installed )
+@njit_parallel
 def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                        rho_prev, rho_next,
                        rho_prev_coef, rho_next_coef, j_coef,
diff --git a/fbpic/main.py b/fbpic/main.py
index a87604771..068491ffc 100644
--- a/fbpic/main.py
+++ b/fbpic/main.py
@@ -202,7 +202,7 @@ def dens_func( z, r ) ...
             self.use_cuda = False
         # CPU multi-threading
         self.use_threading = use_threading
-        if (use_threading) and (threading_installed=False):
+        if (use_threading) and (threading_installed==False):
             print('*** Threading not available for the simulation.')
             print('*** (Please make sure that numba>0.34 is installed)')
             self.use_threading = False
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index cf8401d25..ac9f2cdfd 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -17,14 +17,17 @@
 from .push.numba_methods import push_p_numba, push_p_ioniz_numba, push_x_numba
 from .deposition.numba_methods import deposit_field_numba
 from .gathering.numba_methods import gather_field_numba
-# Load the numba CPU multi-threading methods
-from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \
-    push_x_prange
-from .deposition.threading_methods import deposit_rho_prange_linear, \
-    deposit_J_prange_linear, deposit_rho_prange_cubic, \
-    deposit_J_prange_cubic, sum_reduce_2d_array 
-from .gathering.threading_methods import gather_field_prange_linear, \
-    gather_field_prange_cubic
+
+# Check if threading is available, then import threaded functions
+from fbpic.threading_utils import threading_installed
+if threading_installed:
+    from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \
+        push_x_prange
+    from .deposition.threading_methods import deposit_rho_prange_linear, \
+        deposit_J_prange_linear, deposit_rho_prange_cubic, \
+        deposit_J_prange_cubic, sum_reduce_2d_array
+    from .gathering.threading_methods import gather_field_prange_linear, \
+        gather_field_prange_cubic
 
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed
@@ -505,7 +508,7 @@ def halfpush_x( self ) :
         elif self.use_threading:
             push_x_prange( self.x, self.y, self.z,
                 self.ux, self.uy, self.uz,
-                self.inv_gamma, self.Ntot, self.dt ) 
+                self.inv_gamma, self.Ntot, self.dt )
         # CPU single-core version
         else:
             push_x_numba( self.x, self.y, self.z,
@@ -763,7 +766,7 @@ def deposit( self, fld, fieldtype ) :
         # CPU multi-threading version
         elif self.use_threading:
             # Register particle chunk size for each thread
-            tx_N = int(self.Ntot/self.nthreads) 
+            tx_N = int(self.Ntot/self.nthreads)
             tx_chunks = [ tx_N for k in range(self.nthreads) ]
             tx_chunks[-1] = tx_chunks[-1] + int(self.Ntot%self.nthreads)
             # Multithreading functions for the deposition of rho or J
@@ -771,10 +774,10 @@ def deposit( self, fld, fieldtype ) :
             if fieldtype == 'rho':
                 # Generate temporary arrays for rho
                 rho_m0_global = np.zeros(
-                    (self.nthreads, grid[0].rho.shape[0], grid[0].rho.shape[1]), 
+                    (self.nthreads, grid[0].rho.shape[0], grid[0].rho.shape[1]),
                     dtype=grid[0].rho.dtype )
                 rho_m1_global = np.zeros(
-                    (self.nthreads, grid[1].rho.shape[0], grid[1].rho.shape[1]), 
+                    (self.nthreads, grid[1].rho.shape[0], grid[1].rho.shape[1]),
                     dtype=grid[1].rho.dtype )
                 # Deposit rho using CPU threading
                 if self.particle_shape == 'linear':
@@ -802,22 +805,22 @@ def deposit( self, fld, fieldtype ) :
             elif fieldtype == 'J':
                 # Generate temporary arrays for J
                 Jr_m0_global = np.zeros(
-                    (self.nthreads, grid[0].Jr.shape[0], grid[0].Jr.shape[1]), 
+                    (self.nthreads, grid[0].Jr.shape[0], grid[0].Jr.shape[1]),
                     dtype=grid[0].Jr.dtype )
                 Jt_m0_global = np.zeros(
-                    (self.nthreads, grid[0].Jt.shape[0], grid[0].Jt.shape[1]), 
+                    (self.nthreads, grid[0].Jt.shape[0], grid[0].Jt.shape[1]),
                     dtype=grid[0].Jt.dtype )
                 Jz_m0_global = np.zeros(
-                    (self.nthreads, grid[0].Jz.shape[0], grid[0].Jz.shape[1]), 
+                    (self.nthreads, grid[0].Jz.shape[0], grid[0].Jz.shape[1]),
                     dtype=grid[0].Jz.dtype )
                 Jr_m1_global = np.zeros(
-                    (self.nthreads, grid[1].Jr.shape[0], grid[1].Jr.shape[1]), 
+                    (self.nthreads, grid[1].Jr.shape[0], grid[1].Jr.shape[1]),
                     dtype=grid[1].Jr.dtype )
                 Jt_m1_global = np.zeros(
-                    (self.nthreads, grid[1].Jt.shape[0], grid[1].Jt.shape[1]), 
+                    (self.nthreads, grid[1].Jt.shape[0], grid[1].Jt.shape[1]),
                     dtype=grid[1].Jt.dtype )
                 Jz_m1_global = np.zeros(
-                    (self.nthreads, grid[1].Jz.shape[0], grid[1].Jz.shape[1]), 
+                    (self.nthreads, grid[1].Jz.shape[0], grid[1].Jz.shape[1]),
                     dtype=grid[1].Jz.dtype )
                 # Deposit J using CPU threading
                 if self.particle_shape == 'linear':
diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py
index 1fe2d4057..28bf0e28b 100644
--- a/fbpic/threading_utils.py
+++ b/fbpic/threading_utils.py
@@ -5,11 +5,15 @@
 This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
 It defines a set of generic functions for multithreaded CPU execution.
 """
+from numba import njit
+
 try:
     # Try to import the threading function prange
     from numba import prange
     threading_installed = True
+    njit_parallel = njit( parallel=True )
 except ImportError:
     # If not replace threading functions by single-thread functions
     prange = range
     threading_installed = False
+    njit_parallel = njit

From 8458a0d81395165fbab00700291e499c270e4ce4 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 18 Jul 2017 23:02:34 -0700
Subject: [PATCH 22/36] Removed threaded push methods

---
 fbpic/fields/numba_methods.py             |   1 -
 fbpic/particles/particles.py              |  23 +----
 fbpic/particles/push/numba_methods.py     |  29 +++---
 fbpic/particles/push/threading_methods.py | 114 ----------------------
 4 files changed, 20 insertions(+), 147 deletions(-)
 delete mode 100644 fbpic/particles/push/threading_methods.py

diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py
index f45e10a23..e80652181 100644
--- a/fbpic/fields/numba_methods.py
+++ b/fbpic/fields/numba_methods.py
@@ -5,7 +5,6 @@
 This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
 It defines the optimized fields methods that use numba on a CPU
 """
-import numba
 from scipy.constants import c, epsilon_0, mu_0
 c2 = c**2
 from fbpic.threading_utils import njit_parallel, prange
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index ac9f2cdfd..d3075e14f 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -21,8 +21,6 @@
 # Check if threading is available, then import threaded functions
 from fbpic.threading_utils import threading_installed
 if threading_installed:
-    from .push.threading_methods import push_p_prange, push_p_ioniz_prange, \
-        push_x_prange
     from .deposition.threading_methods import deposit_rho_prange_linear, \
         deposit_J_prange_linear, deposit_rho_prange_cubic, \
         deposit_J_prange_cubic, sum_reduce_2d_array
@@ -460,19 +458,7 @@ def push_p( self ) :
                     self.Ex, self.Ey, self.Ez,
                     self.Bx, self.By, self.Bz,
                     self.m, self.Ntot, self.dt, self.ionizer.ionization_level )
-        # CPU multi-threading version
-        elif self.use_threading:
-            if self.ionizer is None:
-                push_p_prange(self.ux, self.uy, self.uz, self.inv_gamma,
-                    self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
-                    self.q, self.m, self.Ntot, self.dt )
-            else:
-                # Ionizable species can have a charge that depends on the
-                # macroparticle, and hence require a different function
-                push_p_ioniz_prange(self.ux, self.uy, self.uz, self.inv_gamma,
-                    self.Ex, self.Ey, self.Ez, self.Bx, self.By, self.Bz,
-                    self.m, self.Ntot, self.dt, self.ionizer.ionization_level )
-        # CPU single-core version
+        # CPU version
         else:
             if self.ionizer is None:
                 push_p_numba(self.ux, self.uy, self.uz, self.inv_gamma,
@@ -504,12 +490,7 @@ def halfpush_x( self ) :
                 self.inv_gamma, self.dt )
             # The particle array is unsorted after the push in x
             self.sorted = False
-        # CPU multi-threading version
-        elif self.use_threading:
-            push_x_prange( self.x, self.y, self.z,
-                self.ux, self.uy, self.uz,
-                self.inv_gamma, self.Ntot, self.dt )
-        # CPU single-core version
+        # CPU version
         else:
             push_x_numba( self.x, self.y, self.z,
                 self.ux, self.uy, self.uz,
diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py
index 6e5807085..d7b0817ba 100644
--- a/fbpic/particles/push/numba_methods.py
+++ b/fbpic/particles/push/numba_methods.py
@@ -5,11 +5,12 @@
 This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
 It defines the particle push methods on the CPU with numba.
 """
-import numba
 import math
+import numba
+from fbpic.threading_utils import njit_parallel, prange
 from scipy.constants import c, e
 
-@numba.njit
+@njit_parallel
 def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
     """
     Advance the particles' positions over one half-timestep
@@ -21,13 +22,15 @@ def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
     # Half timestep, multiplied by c
     chdt = c*0.5*dt
 
-    # Particle push
-    for ip in range(Ntot) :
+    # Particle push (in parallel if threading is installed)
+    for ip in prange(Ntot) :
         x[ip] += chdt * inv_gamma[ip] * ux[ip]
         y[ip] += chdt * inv_gamma[ip] * uy[ip]
         z[ip] += chdt * inv_gamma[ip] * uz[ip]
 
-@numba.njit
+    return
+
+@njit_parallel
 def push_p_numba( ux, uy, uz, inv_gamma,
                 Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) :
     """
@@ -37,13 +40,15 @@ def push_p_numba( ux, uy, uz, inv_gamma,
     econst = q*dt/(m*c)
     bconst = 0.5*q*dt/m
 
-    # Loop over the particles
-    for ip in range(Ntot) :
+    # Loop over the particles (in parallel if threading is installed)
+    for ip in prange(Ntot) :
         ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
             ux[ip], uy[ip], uz[ip], inv_gamma[ip],
             Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst )
 
-@numba.njit
+    return
+
+@njit_parallel
 def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
                 Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) :
     """
@@ -53,8 +58,8 @@ def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
     prefactor_econst = e*dt/(m*c)
     prefactor_bconst = 0.5*e*dt/m
 
-    # Loop over the particles
-    for ip in range(Ntot) :
+    # Loop over the particles (in parallel if threading is installed)
+    for ip in prange(Ntot) :
 
         # For neutral macroparticles, skip this step
         if ionization_level[ip] == 0:
@@ -69,6 +74,8 @@ def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
             Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip],
             econst, bconst )
 
+    return
+
 @numba.njit
 def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
                 Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
@@ -107,4 +114,4 @@ def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
     uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
     uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
 
-    return( ux_f, uy_f, uz_f, inv_gamma_f )
\ No newline at end of file
+    return( ux_f, uy_f, uz_f, inv_gamma_f )
diff --git a/fbpic/particles/push/threading_methods.py b/fbpic/particles/push/threading_methods.py
deleted file mode 100644
index 06205dbfc..000000000
--- a/fbpic/particles/push/threading_methods.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the particle push methods on the CPU with threading.
-"""
-import numba
-from numba import prange
-import math
-from scipy.constants import c, e
-
-@numba.njit(parallel=True)
-def push_x_prange( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
-    """
-    Advance the particles' positions over one half-timestep
-
-    This assumes that the positions (x, y, z) are initially either
-    one half-timestep *behind* the momenta (ux, uy, uz), or at the
-    same timestep as the momenta.
-    """
-    # Half timestep, multiplied by c
-    chdt = c*0.5*dt
-
-    # Particle push in parallel
-    for ip in prange(Ntot) :
-        x[ip] += chdt * inv_gamma[ip] * ux[ip]
-        y[ip] += chdt * inv_gamma[ip] * uy[ip]
-        z[ip] += chdt * inv_gamma[ip] * uz[ip]
-    return x, y, z
-
-@numba.njit(parallel=True)
-def push_p_prange( ux, uy, uz, inv_gamma,
-                Ex, Ey, Ez, Bx, By, Bz, q, m, Ntot, dt ) :
-    """
-    Advance the particles' momenta, using numba
-    """
-    # Set a few constants
-    econst = q*dt/(m*c)
-    bconst = 0.5*q*dt/m
-
-    # Loop over the particles in parallel
-    for ip in prange(Ntot) :
-        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
-            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
-            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst )
-    return ux, uy, uz, inv_gamma
-
-@numba.njit(parallel=True)
-def push_p_ioniz_prange( ux, uy, uz, inv_gamma,
-                Ex, Ey, Ez, Bx, By, Bz, m, Ntot, dt, ionization_level ) :
-    """
-    Advance the particles' momenta, using numba
-    """
-    # Set a few constants
-    prefactor_econst = e*dt/(m*c)
-    prefactor_bconst = 0.5*e*dt/m
-
-    # Loop over the particles in parallel
-    for ip in prange(Ntot) :
-
-        # For neutral macroparticles, skip this step
-        if ionization_level[ip] == 0:
-            continue
-
-        # Calculate the charge dependent constants
-        econst = prefactor_econst * ionization_level[ip]
-        bconst = prefactor_bconst * ionization_level[ip]
-        # Perform the push
-        ux[ip], uy[ip], uz[ip], inv_gamma[ip] = push_p_vay(
-            ux[ip], uy[ip], uz[ip], inv_gamma[ip],
-            Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip],
-            econst, bconst )
-    return ux, uy, uz, inv_gamma
-
-@numba.njit
-def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,
-                Ex, Ey, Ez, Bx, By, Bz, econst, bconst ):
-    """
-    Push at single macroparticle, using the Vay pusher
-    """
-    # Get the magnetic rotation vector
-    taux = bconst*Bx
-    tauy = bconst*By
-    tauz = bconst*Bz
-    tau2 = taux**2 + tauy**2 + tauz**2
-
-    # Get the momenta at the half timestep
-    uxp = ux_i + econst*Ex \
-    + inv_gamma_i*( uy_i*tauz - uz_i*tauy )
-    uyp = uy_i + econst*Ey \
-    + inv_gamma_i*( uz_i*taux - ux_i*tauz )
-    uzp = uz_i + econst*Ez \
-    + inv_gamma_i*( ux_i*tauy - uy_i*taux )
-    sigma = 1 + uxp**2 + uyp**2 + uzp**2 - tau2
-    utau = uxp*taux + uyp*tauy + uzp*tauz
-
-    # Get the new 1./gamma
-    inv_gamma_f = math.sqrt(
-        2./( sigma + math.sqrt( sigma**2 + 4*(tau2 + utau**2 ) ) ) )
-
-    # Reuse the tau and utau variables to save memory
-    tx = inv_gamma_f*taux
-    ty = inv_gamma_f*tauy
-    tz = inv_gamma_f*tauz
-    ut = inv_gamma_f*utau
-    s = 1./( 1 + tau2*inv_gamma_f**2 )
-
-    # Get the new u
-    ux_f = s*( uxp + tx*ut + uyp*tz - uzp*ty )
-    uy_f = s*( uyp + ty*ut + uzp*tx - uxp*tz )
-    uz_f = s*( uzp + tz*ut + uxp*ty - uyp*tx )
-
-    return( ux_f, uy_f, uz_f, inv_gamma_f )
\ No newline at end of file

From 6317aa3ad7c51e884ec81a7c326f1838189508e0 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 18 Jul 2017 23:12:12 -0700
Subject: [PATCH 23/36] Corrected push_x's return

---
 fbpic/particles/push/numba_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py
index d7b0817ba..6bc1b1d18 100644
--- a/fbpic/particles/push/numba_methods.py
+++ b/fbpic/particles/push/numba_methods.py
@@ -28,7 +28,7 @@ def push_x_numba( x, y, z, ux, uy, uz, inv_gamma, Ntot, dt ):
         y[ip] += chdt * inv_gamma[ip] * uy[ip]
         z[ip] += chdt * inv_gamma[ip] * uz[ip]
 
-    return
+    return x, y, z
 
 @njit_parallel
 def push_p_numba( ux, uy, uz, inv_gamma,

From c817b44141190557d9c9f0ebae3d3411b7dcec50 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 18 Jul 2017 23:28:26 -0700
Subject: [PATCH 24/36] Correct push_p and push_x with return function

---
 fbpic/particles/particles.py          | 8 +++++++-
 fbpic/particles/push/numba_methods.py | 4 ++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index d3075e14f..300d07b12 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -130,12 +130,18 @@ def dens_func( z, r ) ...
         # Register the timestep
         self.dt = dt
 
-        # Define wether or not to use the GPU
+        # Define whether or not to use the GPU
         self.use_cuda = use_cuda
         if (self.use_cuda==True) and (cuda_installed==False) :
             print('*** Cuda not available for the particles.')
             print('*** Performing the particle operations on the CPU.')
             self.use_cuda = False
+        # Define whether or not to use threading
+        self.use_threading = use_threading
+        if (self.use_threading==True) and (threading_installed==False) :
+            print('*** Threading not available for the simulation.')
+            print('*** (Please make sure that numba>0.34 is installed)')
+            self.use_threading = False
 
         # Register the properties of the particles
         # (Necessary for the pusher, and when adding more particles later, )
diff --git a/fbpic/particles/push/numba_methods.py b/fbpic/particles/push/numba_methods.py
index 6bc1b1d18..6e3843e1b 100644
--- a/fbpic/particles/push/numba_methods.py
+++ b/fbpic/particles/push/numba_methods.py
@@ -46,7 +46,7 @@ def push_p_numba( ux, uy, uz, inv_gamma,
             ux[ip], uy[ip], uz[ip], inv_gamma[ip],
             Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip], econst, bconst )
 
-    return
+    return ux, uy, uz, inv_gamma
 
 @njit_parallel
 def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
@@ -74,7 +74,7 @@ def push_p_ioniz_numba( ux, uy, uz, inv_gamma,
             Ex[ip], Ey[ip], Ez[ip], Bx[ip], By[ip], Bz[ip],
             econst, bconst )
 
-    return
+    return ux, uy, uz, inv_gamma
 
 @numba.njit
 def push_p_vay( ux_i, uy_i, uz_i, inv_gamma_i,

From f1f2ab2f9d18c7a5b1a95506ccdbcf4898d236aa Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Wed, 19 Jul 2017 07:46:14 -0700
Subject: [PATCH 25/36] Give the right threading flag to particles

---
 fbpic/particles/particles.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index 300d07b12..136a79b33 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -234,7 +234,6 @@ def dens_func( z, r ) ...
             # Register boolean that records if the particles are sorted or not
             self.sorted = False
         # Register variables when using multithreading
-        self.use_threading = use_threading
         if self.use_threading == True:
             # Register number of threads
             self.nthreads = numba.config.NUMBA_NUM_THREADS

From 54bd6f0bd46af8bdaba61da21959fbcd52efbec3 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Wed, 19 Jul 2017 12:02:49 -0700
Subject: [PATCH 26/36] Threaded the routines that convert from p/m to r/t
 components

---
 .../spectral_transformer.py                   | 29 +++++-----
 .../spectral_transform/threading_methods.py   | 58 +++++++++++++++++++
 2 files changed, 74 insertions(+), 13 deletions(-)
 create mode 100644 fbpic/fields/spectral_transform/threading_methods.py

diff --git a/fbpic/fields/spectral_transform/spectral_transformer.py b/fbpic/fields/spectral_transform/spectral_transformer.py
index a9f99222f..af2d2858b 100644
--- a/fbpic/fields/spectral_transform/spectral_transformer.py
+++ b/fbpic/fields/spectral_transform/spectral_transformer.py
@@ -9,6 +9,7 @@
 from .hankel import DHT
 from .fourier import FFT
 
+from .threading_methods import numba_rt_to_pm, numba_pm_to_rt
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed
 if cuda_installed:
@@ -139,12 +140,13 @@ def spect2interp_vect( self, spect_array_p, spect_array_m,
                 self.spect_buffer_r, self.spect_buffer_t )
         else :
             # Combine them on the CPU
-            # (It is important to write the affectation in the following way,
-            # since self.spect_buffer_p and self.spect_buffer_r actually point
-            # to the same object, for memory economy)
-            self.spect_buffer_r[:,:], self.spect_buffer_t[:,:] = \
-                    ( self.spect_buffer_p + self.spect_buffer_m), \
-                1.j*( self.spect_buffer_p - self.spect_buffer_m)
+            # (self.spect_buffer_r and self.spect_buffer_t are
+            # passed in the following line, in order to make things
+            # explicit, but they actually point to the same object
+            # as self.spect_buffer_p, self.spect_buffer_m,
+            # for economy of memory)
+            numba_pm_to_rt( self.spect_buffer_p, self.spect_buffer_m,
+                            self.spect_buffer_r, self.spect_buffer_t )
 
         # Finally perform the FFT (along axis 0, which corresponds to z)
         self.fft.inverse_transform( self.spect_buffer_r, interp_array_r )
@@ -205,13 +207,14 @@ def interp2spect_vect( self, interp_array_r, interp_array_t,
                 self.spect_buffer_r, self.spect_buffer_t,
                 self.spect_buffer_p, self.spect_buffer_m )
         else :
-            # Combine them on the CPU
-            # (It is important to write the affectation in the following way,
-            # since self.spect_buffer_p and self.spect_buffer_r actually point
-            # to the same object, for memory economy.)
-            self.spect_buffer_p[:,:], self.spect_buffer_m[:,:] = \
-                0.5*( self.spect_buffer_r - 1.j*self.spect_buffer_t ), \
-                0.5*( self.spect_buffer_r + 1.j*self.spect_buffer_t )
+            # Combine them on the CPU
+            # (self.spect_buffer_p and self.spect_buffer_m are
+            # passed in the following line, in order to make things
+            # explicit, but they actually point to the same object
+            # as self.spect_buffer_r, self.spect_buffer_t,
+            # for economy of memory)
+            numba_rt_to_pm( self.spect_buffer_r, self.spect_buffer_t,
+                            self.spect_buffer_p, self.spect_buffer_m )
 
         # Perform the inverse DHT (along axis -1, which corresponds to r)
         self.dhtp.transform( self.spect_buffer_p, spect_array_p )
diff --git a/fbpic/fields/spectral_transform/threading_methods.py b/fbpic/fields/spectral_transform/threading_methods.py
new file mode 100644
index 000000000..ce16dbeef
--- /dev/null
+++ b/fbpic/fields/spectral_transform/threading_methods.py
@@ -0,0 +1,58 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines a set of functions that are useful when converting the
+fields from interpolation grid to the spectral grid and vice-versa
+"""
+from fbpic.threading_utils import prange, njit_parallel
+
+# ----------------------------------------------------
+# Functions that combine components in spectral space
+# ----------------------------------------------------
+
+@njit_parallel
+def numba_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) :
+    """
+    Combine the arrays buffer_r and buffer_t to produce the
+    arrays buffer_p and buffer_m, according to the rules of
+    the Fourier-Hankel decomposition (see associated paper)
+    """
+    Nz, Nr = buffer_r.shape
+
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
+        for ir in range(Nr):
+
+            # Use intermediate variables, as the arrays
+            # buffer_r and buffer_t may actually point to the same
+            # object as buffer_p and buffer_m, for economy of memory
+            value_r = buffer_r[iz, ir]
+            value_t = buffer_t[iz, ir]
+            # Combine the values
+            buffer_p[iz, ir] = 0.5*( value_r - 1.j*value_t )
+            buffer_m[iz, ir] = 0.5*( value_r + 1.j*value_t )
+
+
+@njit_parallel
+def numba_pm_to_rt( buffer_p, buffer_m, buffer_r, buffer_t ) :
+    """
+    Combine the arrays buffer_p and buffer_m to produce the
+    arrays buffer_r and buffer_t, according to the rules of
+    the Fourier-Hankel decomposition (see associated paper)
+    """
+    Nz, Nr = buffer_p.shape
+
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
+        for ir in range(Nr):
+
+            # Use intermediate variables, as the arrays
+            # buffer_r and buffer_t may actually point to the same
+            # object as buffer_p and buffer_m, for economy of memory
+            value_p = buffer_p[iz, ir]
+            value_m = buffer_m[iz, ir]
+            # Combine the values
+            buffer_r[iz, ir] =     ( value_p + value_m )
+            buffer_t[iz, ir] = 1.j*( value_p - value_m )

From 5d2c7e5de971be4ffd737b8c7320ec7f63f2e7c4 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 20 Jul 2017 07:54:26 -0700
Subject: [PATCH 27/36] Remove the flag `use_threading` as an input argument

---
 fbpic/main.py                | 24 ++++++++---------------
 fbpic/particles/particles.py | 16 ++++-----------
 fbpic/threading_utils.py     | 38 +++++++++++++++++++++++++++---------
 3 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/fbpic/main.py b/fbpic/main.py
index 068491ffc..2266d14ca 100644
--- a/fbpic/main.py
+++ b/fbpic/main.py
@@ -12,7 +12,7 @@
 from mpi4py import MPI
 import numba
 # Check if threading is available
-from .threading_utils import threading_installed
+from .threading_utils import threading_enabled
 # Check if CUDA is available, then import CUDA functions
 from .cuda_utils import cuda_installed
 if cuda_installed:
@@ -45,7 +45,7 @@ def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax,
                  p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0.,
                  n_order=-1, dens_func=None, filter_currents=True,
                  v_comoving=None, use_galilean=False, initialize_ions=False,
-                 use_cuda=False, use_threading=True, nthreads=None,
+                 use_cuda=False, nthreads=None,
                  n_guard=None, n_damp=30, exchange_period=None,
                  boundaries='periodic', gamma_boost=None,
                  use_all_mpi_ranks=True, particle_shape='linear' ):
@@ -134,12 +134,10 @@ def dens_func( z, r ) ...
 
         use_cuda: bool, optional
             Wether to use CUDA (GPU) acceleration
-        use_threading : bool, optional
-            Wether to use multi-threading on the CPU.
         nthreads: int, optional
-            Number of CPU multi-threading threads used (if use_threading
-            is set). If nthreads is set to None, the number of threads
-            are automatically determined.
+            Number of CPU multi-threading threads used (if threading is
+            enabled) If nthreads is set to None, the number of threads
+            is automatically determined.
 
         n_guard: int, optional
             Number of guard cells to use at the left and right of
@@ -201,11 +199,7 @@ def dens_func( z, r ) ...
             print('*** Performing the simulation on CPU.')
             self.use_cuda = False
         # CPU multi-threading
-        self.use_threading = use_threading
-        if (use_threading) and (threading_installed==False):
-            print('*** Threading not available for the simulation.')
-            print('*** (Please make sure that numba>0.34 is installed)')
-            self.use_threading = False
+        self.use_threading = threading_enabled
         if self.use_threading:
             # Define number of threads used
             if nthreads is not None:
@@ -260,16 +254,14 @@ def dens_func( z, r ) ...
                       zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax,
                       Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m,
                       grid_shape=grid_shape, particle_shape=particle_shape,
-                      use_cuda=self.use_cuda,
-                      use_threading=self.use_threading) ]
+                      use_cuda=self.use_cuda ) ]
         if initialize_ions :
             self.ptcl.append(
                 Particles(q=e, m=m_p, n=n_e, Npz=Npz, zmin=p_zmin,
                           zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax,
                           Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m,
                           grid_shape=grid_shape, particle_shape=particle_shape,
-                          use_cuda=self.use_cuda,
-                          use_threading=self.use_threading) )
+                          use_cuda=self.use_cuda ) )
 
         # Register the number of particles per cell along z, and dt
         # (Necessary for the moving window)
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index 136a79b33..8474d2dc2 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -19,8 +19,8 @@
 from .gathering.numba_methods import gather_field_numba
 
 # Check if threading is available, then import threaded functions
-from fbpic.threading_utils import threading_installed
-if threading_installed:
+from fbpic.threading_utils import threading_enabled
+if threading_enabled:
     from .deposition.threading_methods import deposit_rho_prange_linear, \
         deposit_J_prange_linear, deposit_rho_prange_cubic, \
         deposit_J_prange_cubic, sum_reduce_2d_array
@@ -60,7 +60,7 @@ def __init__(self, q, m, n, Npz, zmin, zmax,
                     ux_th=0., uy_th=0., uz_th=0.,
                     dens_func=None, continuous_injection=True,
                     grid_shape=None, particle_shape='linear',
-                    use_cuda=False, use_threading=True) :
+                    use_cuda=False ) :
         """
         Initialize a uniform set of particles
 
@@ -123,9 +123,6 @@ def dens_func( z, r ) ...
 
         use_cuda : bool, optional
             Wether to use the GPU or not.
-
-        use_threading : bool, optional
-            Wether to use multi-threading on the CPU.
         """
         # Register the timestep
         self.dt = dt
@@ -136,12 +133,6 @@ def dens_func( z, r ) ...
             print('*** Cuda not available for the particles.')
             print('*** Performing the particle operations on the CPU.')
             self.use_cuda = False
-        # Define whether or not to use threading
-        self.use_threading = use_threading
-        if (self.use_threading==True) and (threading_installed==False) :
-            print('*** Threading not available for the simulation.')
-            print('*** (Please make sure that numba>0.34 is installed)')
-            self.use_threading = False
 
         # Register the properties of the particles
         # (Necessary for the pusher, and when adding more particles later, )
@@ -234,6 +225,7 @@ def dens_func( z, r ) ...
             # Register boolean that records if the particles are sorted or not
             self.sorted = False
         # Register variables when using multithreading
+        self.use_threading = threading_enabled
         if self.use_threading == True:
             # Register number of threads
             self.nthreads = numba.config.NUMBA_NUM_THREADS
diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py
index 28bf0e28b..e5bb23b0a 100644
--- a/fbpic/threading_utils.py
+++ b/fbpic/threading_utils.py
@@ -5,15 +5,35 @@
 This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
 It defines a set of generic functions for multithreaded CPU execution.
 """
+import os
 from numba import njit
 
-try:
-    # Try to import the threading function prange
-    from numba import prange
-    threading_installed = True
-    njit_parallel = njit( parallel=True )
-except ImportError:
-    # If not replace threading functions by single-thread functions
-    prange = range
-    threading_installed = False
+# By default threading is enabled
+threading_enabled = True
+
+# Check if the environment variable FBPIC_DISABLE_THREADING is set to 1
+# and in that case, disable threading
+if 'FBPIC_DISABLE_THREADING' in os.environ:
+    if int(os.environ['FBPIC_DISABLE_THREADING']) == 1:
+        threading_enabled = False
+
+# If the user requests threading (by not setting FBPIC_DISABLE_THREADING),
+# check that it is indeed installed
+if threading_enabled:
+    try:
+        # Try to import the threading function prange
+        from numba import prange
+    except ImportError:
+        threading_enabled = False
+        print('*** Threading not available for the simulation.')
+        print('*** (Please make sure that numba>0.34 is installed)')
+
+# Set the function njit_parallel and prange to the correct object
+if not threading_enabled:
+    # Use regular serial compilation function
     njit_parallel = njit
+    prange = range
+else:
+    # Use the parallel compilation function
+    njit_parallel = njit( parallel=True )
+    from numba import prange

From 62e37c5162cfef84774ae6a36ba854a0f4c63de8 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 20 Jul 2017 08:24:38 -0700
Subject: [PATCH 28/36] Correct pyflakes errors

---
 fbpic/threading_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py
index e5bb23b0a..4a1200605 100644
--- a/fbpic/threading_utils.py
+++ b/fbpic/threading_utils.py
@@ -22,7 +22,7 @@
 if threading_enabled:
     try:
         # Try to import the threading function prange
-        from numba import prange
+        import numba.prange
     except ImportError:
         threading_enabled = False
         print('*** Threading not available for the simulation.')
@@ -36,4 +36,4 @@
 else:
     # Use the parallel compilation function
     njit_parallel = njit( parallel=True )
-    from numba import prange
+    prange = numba.prange

From 076b6686b856cc39ef7837a7dec78814ec4075e7 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 20 Jul 2017 08:19:32 -0700
Subject: [PATCH 29/36] Thread the shifting of the grid in spectral space

---
 fbpic/boundaries/moving_window.py | 127 +++++++++++++-----------------
 1 file changed, 55 insertions(+), 72 deletions(-)

diff --git a/fbpic/boundaries/moving_window.py b/fbpic/boundaries/moving_window.py
index 33d2d921e..8080ca769 100644
--- a/fbpic/boundaries/moving_window.py
+++ b/fbpic/boundaries/moving_window.py
@@ -318,86 +318,36 @@ def shift_spect_grid( self, grid, n_move,
         """
         if grid.use_cuda:
             shift = grid.d_field_shift
+            # Get a 2D CUDA grid of the size of the grid
+            tpb, bpg = cuda_tpb_bpg_2d( grid.Ep.shape[0], grid.Ep.shape[1] )
             # Shift all the fields on the GPU
-            self.shift_spect_field_gpu( grid.Ep, shift, n_move )
-            self.shift_spect_field_gpu( grid.Em, shift, n_move )
-            self.shift_spect_field_gpu( grid.Ez, shift, n_move )
-            self.shift_spect_field_gpu( grid.Bp, shift, n_move )
-            self.shift_spect_field_gpu( grid.Bm, shift, n_move )
-            self.shift_spect_field_gpu( grid.Bz, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Ep, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Em, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Ez, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Bp, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Bm, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Bz, shift, n_move )
             if shift_rho:
-                self.shift_spect_field_gpu( grid.rho_prev, shift, n_move )
+                shift_spect_array_gpu[tpb, bpg]( grid.rho_prev, shift, n_move )
             if shift_currents:
-                self.shift_spect_field_gpu( grid.Jp, shift, n_move )
-                self.shift_spect_field_gpu( grid.Jm, shift, n_move )
-                self.shift_spect_field_gpu( grid.Jz, shift, n_move )
+                shift_spect_array_gpu[tpb, bpg]( grid.Jp, shift, n_move )
+                shift_spect_array_gpu[tpb, bpg]( grid.Jm, shift, n_move )
+                shift_spect_array_gpu[tpb, bpg]( grid.Jz, shift, n_move )
         else:
             shift = grid.field_shift
             # Shift all the fields on the CPU
-            self.shift_spect_field( grid.Ep, shift, n_move )
-            self.shift_spect_field( grid.Em, shift, n_move )
-            self.shift_spect_field( grid.Ez, shift, n_move )
-            self.shift_spect_field( grid.Bp, shift, n_move )
-            self.shift_spect_field( grid.Bm, shift, n_move )
-            self.shift_spect_field( grid.Bz, shift, n_move )
+            shift_spect_array_cpu( grid.Ep, shift, n_move )
+            shift_spect_array_cpu( grid.Em, shift, n_move )
+            shift_spect_array_cpu( grid.Ez, shift, n_move )
+            shift_spect_array_cpu( grid.Bp, shift, n_move )
+            shift_spect_array_cpu( grid.Bm, shift, n_move )
+            shift_spect_array_cpu( grid.Bz, shift, n_move )
             if shift_rho:
-                self.shift_spect_field( grid.rho_prev, shift, n_move )
+                shift_spect_array_cpu( grid.rho_prev, shift, n_move )
             if shift_currents:
-                self.shift_spect_field( grid.Jp, shift, n_move )
-                self.shift_spect_field( grid.Jm, shift, n_move )
-                self.shift_spect_field( grid.Jz, shift, n_move )
-
-    def shift_spect_field( self, field_array, shift_factor, n_move ):
-        """
-        Shift the field 'field_array' by n_move cells.
-        This is done in spectral space and corresponds to multiplying the
-        fields with the factor exp(i*kz_true*dz)**n_move .
-        (Typically n_move is positive, and the fields are shifted backwards)
-
-        Parameters
-        ----------
-        field_array: 2darray of complexs
-            Contains the value of the fields, and is modified by
-            this function
-
-        shift_factor: 1darray of complexs
-            Contains the shift array, that is multiplied to the fields in
-            spectral space to shift them by one cell in spatial space
-            ( exp(i*kz_true*dz) )
-
-        n_move: int
-            The number of cells by which the grid should be shifted
-        """
-        # Multiply with (shift_factor*sign(n_move))**n_move
-        field_array *= ( shift_factor[:, np.newaxis] )**n_move
-
-    def shift_spect_field_gpu( self, field_array, shift_factor, n_move):
-        """
-        Shift the field 'field_array' by n_move cells on the GPU.
-        This is done in spectral space and corresponds to multiplying the
-        fields with the factor exp(i*kz_true*dz)**n_move .
-        (Typically n_move is positive, and the fields are shifted backwards)
-
-        Parameters
-        ----------
-        field_array: 2darray of complexs
-            Contains the value of the fields, and is modified by
-            this function
-
-        shift_factor: 1darray of complexs
-            Contains the shift array, that is multiplied to the fields in
-            spectral space to shift them by one cell in spatial space
-            ( exp(i*kz_true*dz) )
-
-        n_move: int
-            The number of cells by which the grid should be shifted
-        """
-        # Get a 2D CUDA grid of the size of the grid
-        dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d(
-            field_array.shape[0], field_array.shape[1] )
-        # Shift the field array in place
-        shift_spect_array_gpu[dim_grid_2d, dim_block_2d](
-            field_array, shift_factor, n_move)
+                shift_spect_array_cpu( grid.Jp, shift, n_move )
+                shift_spect_array_cpu( grid.Jm, shift, n_move )
+                shift_spect_array_cpu( grid.Jz, shift, n_move )
 
     def shift_interp_grid( self, grid, n_move,
                            shift_rho=True, shift_currents=False ):
@@ -513,6 +463,39 @@ def shift_interp_field_gpu( self, field_array, n_move):
         # Return the new shifted field array
         return( field_array )
 
+@njit_parallel
+def shift_spect_array_cpu( field_array, shift_factor, n_move ):
+    """
+    Shift the field 'field_array' by n_move cells on CPU.
+    This is done in spectral space and corresponds to multiplying the
+    fields with the factor exp(i*kz_true*dz)**n_move .
+
+    Parameters
+    ----------
+    field_array: 2darray of complexs
+        Contains the value of the fields, and is modified by
+        this function
+
+    shift_factor: 1darray of complexs
+        Contains the shift array, that is multiplied to the fields in
+        spectral space to shift them by one cell in spatial space
+        ( exp(i*kz_true*dz) )
+
+    n_move: int
+        The number of cells by which the grid should be shifted
+    """
+    # Get a 2D CUDA grid
+    iz, ir = cuda.grid(2)
+
+    # Only access values that are actually in the array
+    if ir < field_array.shape[1] and iz < field_array.shape[0]:
+        # Calculate the shift factor (raising to the power n_move)
+        power_shift = shift_factor[iz]
+        for i in range(1,n_move):
+            power_shift *= shift_factor[iz]
+        # Shift fields backwards
+        field_array[iz, ir] *= power_shift
+
 if cuda_installed:
 
     @cuda.jit('void(complex128[:,:], complex128[:,:], int32)')

From 0e654a18ba6d9c6a7a1d23157d18fe16288e8bb0 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 20 Jul 2017 08:41:23 -0700
Subject: [PATCH 30/36] Fix the threaded shift function

---
 fbpic/boundaries/moving_window.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fbpic/boundaries/moving_window.py b/fbpic/boundaries/moving_window.py
index 8080ca769..f08775b79 100644
--- a/fbpic/boundaries/moving_window.py
+++ b/fbpic/boundaries/moving_window.py
@@ -9,6 +9,7 @@
 from scipy.constants import c
 from fbpic.particles import Particles
 from fbpic.lpa_utils.boosted_frame import BoostConverter
+from fbpic.threading_utils import njit_parallel, prange
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed
 if cuda_installed:
@@ -484,17 +485,17 @@ def shift_spect_array_cpu( field_array, shift_factor, n_move ):
     n_move: int
         The number of cells by which the grid should be shifted
     """
-    # Get a 2D CUDA grid
-    iz, ir = cuda.grid(2)
+    Nz, Nr = field_array.shape
 
-    # Only access values that are actually in the array
-    if ir < field_array.shape[1] and iz < field_array.shape[0]:
-        # Calculate the shift factor (raising to the power n_move)
+    # Loop over the 2D array (in parallel over z if threading is enabled)
+    for iz in prange( Nz ):
         power_shift = shift_factor[iz]
+        # Calculate the shift factor (raising to the power n_move)
         for i in range(1,n_move):
             power_shift *= shift_factor[iz]
         # Shift fields backwards
-        field_array[iz, ir] *= power_shift
+        for ir in range( Nr ):
+            field_array[iz, ir] *= power_shift
 
 if cuda_installed:
 

From 11ce49d1fd8bf06459977f30346a6f960167574c Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 20 Jul 2017 09:51:50 -0700
Subject: [PATCH 31/36] Remove arguments nthreads

---
 fbpic/main.py | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/fbpic/main.py b/fbpic/main.py
index 2266d14ca..647d9c287 100644
--- a/fbpic/main.py
+++ b/fbpic/main.py
@@ -45,8 +45,7 @@ def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax,
                  p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0.,
                  n_order=-1, dens_func=None, filter_currents=True,
                  v_comoving=None, use_galilean=False, initialize_ions=False,
-                 use_cuda=False, nthreads=None,
-                 n_guard=None, n_damp=30, exchange_period=None,
+                 use_cuda=False, n_guard=None, n_damp=30, exchange_period=None,
                  boundaries='periodic', gamma_boost=None,
                  use_all_mpi_ranks=True, particle_shape='linear' ):
         """
@@ -134,10 +133,6 @@ def dens_func( z, r ) ...
 
         use_cuda: bool, optional
             Wether to use CUDA (GPU) acceleration
-        nthreads: int, optional
-            Number of CPU multi-threading threads used (if threading is
-            enabled) If nthreads is set to None, the number of threads
-            is automatically determined.
 
         n_guard: int, optional
             Number of guard cells to use at the left and right of
@@ -200,15 +195,7 @@ def dens_func( z, r ) ...
             self.use_cuda = False
         # CPU multi-threading
         self.use_threading = threading_enabled
-        if self.use_threading:
-            # Define number of threads used
-            if nthreads is not None:
-                # Automatically take numba preset for number of threads
-                self.nthreads = nthreads
-                numba.config.NUMBA_NUM_THREADS = self.nthreads
-            else:
-                # Set user-defined number of threads
-                self.nthreads = numba.config.NUMBA_NUM_THREADS
+
         # Register the comoving parameters
         self.v_comoving = v_comoving
         self.use_galilean = use_galilean

From 77be6cf95eca587d3593114e9721b6aa5862bc81 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 20 Jul 2017 14:16:41 -0700
Subject: [PATCH 32/36] Modified import structure of the prange function

---
 fbpic/threading_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fbpic/threading_utils.py b/fbpic/threading_utils.py
index 4a1200605..a35d9a8c5 100644
--- a/fbpic/threading_utils.py
+++ b/fbpic/threading_utils.py
@@ -22,7 +22,7 @@
 if threading_enabled:
     try:
         # Try to import the threading function prange
-        import numba.prange
+        from numba import prange as numba_prange
     except ImportError:
         threading_enabled = False
         print('*** Threading not available for the simulation.')
@@ -36,4 +36,4 @@
 else:
     # Use the parallel compilation function
     njit_parallel = njit( parallel=True )
-    prange = numba.prange
+    prange = numba_prange

From 777281990f318a86e49b9273b34d20b6a4f7c2a0 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Fri, 21 Jul 2017 21:09:54 -0700
Subject: [PATCH 33/36] Replace line endings to unix style

---
 .../particles/deposition/threading_methods.py | 3086 ++++++++---------
 .../particles/gathering/threading_methods.py  | 1202 +++----
 2 files changed, 2144 insertions(+), 2144 deletions(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 11ecb54c8..3aa895087 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -1,1543 +1,1543 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the deposition methods for rho and J for linear and cubic
-order shapes on the CPU with threading.
-"""
-import numba
-from numba import prange, int64
-import math
-from scipy.constants import c
-
-# -------------------------------
-# Particle shape Factor functions 
-# -------------------------------
-
-# Linear shapes
-@numba.njit
-def z_shape_linear(cell_position, index):
-    iz = int64(math.floor(cell_position))
-    if index == 0:
-        return iz+1.-cell_position
-    if index == 1:
-        return cell_position - iz
-
-@numba.njit
-def r_shape_linear(cell_position, index):
-    flip_factor = 1.
-    ir = int64(math.floor(cell_position))
-    if index == 0:
-        if ir < 0:
-            flip_factor = -1.
-        return flip_factor*(ir+1.-cell_position)
-    if index == 1:
-        return flip_factor*(cell_position - ir)
-
-# Cubic shapes
-@numba.njit
-def z_shape_cubic(cell_position, index):
-    iz = int64(math.floor(cell_position)) - 1
-    if index == 0:
-        return (-1./6.)*((cell_position-iz)-2)**3
-    if index == 1:
-        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)
-    if index == 2:
-        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)
-    if index == 3:
-        return (-1./6.)*(((iz+3)-cell_position)-2)**3
-
-@numba.njit
-def r_shape_cubic(cell_position, index):
-    flip_factor = 1.
-    ir = int64(math.floor(cell_position)) - 1
-    if index == 0:
-        if ir < 0:
-            flip_factor = -1.
-        return flip_factor*(-1./6.)*((cell_position-ir)-2)**3
-    if index == 1:
-        if ir+1 < 0:
-            flip_factor = -1.
-        return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4)
-    if index == 2:
-        if ir+2 < 0:
-            flip_factor = -1.
-        return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4)
-    if index == 3:
-        if ir+3 < 0:
-            flip_factor = -1.
-        return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3
-
-# -------------------------------
-# Field deposition - linear - rho
-# -------------------------------
-
-@numba.njit(parallel=True)
-def deposit_rho_prange_linear(x, y, z, w,
-                           invdz, zmin, Nz,
-                           invdr, rmin, Nr,
-                           rho_m0_global, rho_m1_global,
-                           nthreads, tx_chunks):
-    """
-    Deposition of the charge density rho using numba prange on the CPU.
-    Iterates over the threads in parallel, while each thread iterates
-    over a batch of particles. Intermediate results for each threads are
-    stored in copies of the global grid. At the end of the parallel loop,
-    the thread-local field arrays are combined (summed) to a global array.
-    (This final reduction is *not* done in this function)
-
-    Calculates the weighted amount of rho that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The rest of the execution is similar to the CUDA equivalent function.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr)
-        The global helper arrays to store the thread local charge densities
-        on the interpolation grid for mode 0 and 1. 
-        (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the considered direction
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    nthreads : int
-        Number of CPU threads used with numba prange
-
-    tx_chunks : list of int
-        Exact particle batch size per thread.
-        Last batch size can be greater if Ntot is not a multiple of nthreads.
-    """
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    for tx in prange( nthreads ):
-        # Loop over all particles in thread chunk
-        for idx in range( tx_chunks[tx] ):
-            # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_chunks[0]
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1./rj
-                cos = xj*invr  # Cosine
-                sin = yj*invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j*sin
-
-            # Positions of the particles, in the cell unit
-            r_cell = invdr*(rj - rmin) - 0.5
-            z_cell = invdz*(zj - zmin) - 0.5
-
-            # Calculate rho
-            # --------------------------------------------
-            # Mode 0
-            R_m0_scal = wj * exptheta_m0
-            # Mode 1
-            R_m1_scal = wj * exptheta_m1
-
-            # Original index of the uppper and lower cell
-            ir_cell = int(math.floor( r_cell ))
-            iz_cell = int(math.floor( z_cell ))
-
-            # Treat the boundary conditions
-            # guard cells in lower r
-            if ir_cell < 0:
-                ir_cell = 0
-            # absorbing in upper r
-            if ir_cell > Nr-1:
-                ir_cell = Nr-1
-            # periodic boundaries in z
-            if iz_cell < 0:
-                iz_cell += Nz
-            if iz_cell > Nz-1:
-                iz_cell -= Nz
-
-            # Boundary Region Shifts
-            ir_flip = int( math.floor(r_cell) )
-
-            # Declare local field array
-            R_m0_00 = 0.
-            R_m0_01 = 0.
-            R_m0_10 = 0.
-            R_m0_11 = 0.
-
-            R_m1_00 = 0. + 0.j
-            R_m1_01 = 0. + 0.j
-            R_m1_10 = 0. + 0.j
-            R_m1_11 = 0. + 0.j
-
-            R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal
-            R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal
-            R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal
-            R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal
-
-            if ir_flip == -1:
-                R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
-                R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
-                R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
-                R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
-            else:
-                R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
-                R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
-                R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
-                R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
-
-            # Cell shifts for the simulation boundaries
-            shift_r = 0
-            shift_z = 0
-            if ir_cell+1 > (Nr-1):
-                shift_r = -1
-            if iz_cell+1 > Nz-1:
-                shift_z -= Nz
-
-            # Write ptcl fields to thread-local part of global deposition array
-            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00
-            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00
-
-            rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m0_01
-            rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m1_01
-
-            rho_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m0_10
-            rho_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m1_10
-
-            rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
-            rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11
-
-    return
-
-# -------------------------------
-# Field deposition - linear - J
-# -------------------------------
-
-@numba.njit(parallel=True)
-def deposit_J_prange_linear(x, y, z, w,
-                         ux, uy, uz, inv_gamma,
-                         invdz, zmin, Nz,
-                         invdr, rmin, Nr,
-                         j_r_m0_global, j_r_m1_global,
-                         j_t_m0_global, j_t_m1_global,
-                         j_z_m0_global, j_z_m1_global,
-                         nthreads, tx_chunks):
-    """
-    Deposition of the current density J using numba prange on the CPU.
-    Iterates over the threads in parallel, while each thread iterates
-    over a batch of particles. Intermediate results for each threads are
-    stored in copies of the global grid. At the end of the parallel loop,
-    the thread-local field arrays are combined (summed) to the global array.
-    (This final reduction is *not* done in this function)
-
-    Calculates the weighted amount of J that is deposited to the
-    4 cells surounding the particle based on its shape (linear).
-
-    The rest of the execution is similar to the CUDA equivalent function.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    ux, uy, uz : 1darray of floats (in meters * second^-1)
-        The velocity of the particles
-
-    inv_gamma : 1darray of floats
-        The inverse of the relativistic gamma factor
-
-    j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr)
-        The global helper arrays to store the thread local current component
-        in each direction (r, t, z) on the interpolation grid for mode 0 and 1.
-        (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    nthreads : int
-        Number of CPU threads used with numba prange
-
-    tx_chunks : list of int
-        Exact particle batch size per thread.
-        Last batch size can be greater if Ntot is not a multiple of nthreads.
-    """
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    for tx in prange( nthreads ):
-        # Loop over all particles in thread chunk
-        for idx in range( tx_chunks[tx] ):
-            # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_chunks[0]
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Velocity
-            uxj = ux[ptcl_idx]
-            uyj = uy[ptcl_idx]
-            uzj = uz[ptcl_idx]
-            # Inverse gamma
-            inv_gammaj = inv_gamma[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1./rj
-                cos = xj*invr  # Cosine
-                sin = yj*invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j*sin
-
-            # Get weights for the deposition
-            # --------------------------------------------
-            # Positions of the particles, in the cell unit
-            r_cell = invdr*(rj - rmin) - 0.5
-            z_cell = invdz*(zj - zmin) - 0.5
-
-            # Calculate the currents
-            # --------------------------------------------
-            # Mode 0
-            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
-            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
-            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
-            # Mode 1
-            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
-            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
-            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
-
-            # Original index of the uppper and lower cell
-            ir_cell = int(math.floor( r_cell ))
-            iz_cell = int(math.floor( z_cell ))
-
-            # Treat the boundary conditions
-            # guard cells in lower r
-            if ir_cell < 0:
-                ir_cell = 0
-            # absorbing in upper r
-            if ir_cell > Nr-1:
-                ir_cell = Nr-1
-            # periodic boundaries in z
-            if iz_cell < 0:
-                iz_cell += Nz
-            if iz_cell > Nz-1:
-                iz_cell -= Nz
-
-            # Boundary Region Shifts
-            ir_flip = int( math.floor(r_cell) )
-
-            # Declare local field arrays
-            J_r_m0_00 = 0.
-            J_r_m1_00 = 0. + 0.j
-            J_t_m0_00 = 0.
-            J_t_m1_00 = 0. + 0.j
-            J_z_m0_00 = 0.
-            J_z_m1_00 = 0. + 0.j
-
-            J_r_m0_01 = 0.
-            J_r_m1_01 = 0. + 0.j
-            J_t_m0_01 = 0.
-            J_t_m1_01 = 0. + 0.j
-            J_z_m0_01 = 0.
-            J_z_m1_01 = 0. + 0.j
-
-            J_r_m0_10 = 0.
-            J_r_m1_10 = 0. + 0.j
-            J_t_m0_10 = 0.
-            J_t_m1_10 = 0. + 0.j
-            J_z_m0_10 = 0.
-            J_z_m1_10 = 0. + 0.j
-
-            J_r_m0_11 = 0.
-            J_r_m1_11 = 0. + 0.j
-            J_t_m0_11 = 0.
-            J_t_m1_11 = 0. + 0.j
-            J_z_m0_11 = 0.
-            J_z_m1_11 = 0. + 0.j
-
-            J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal
-            J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal
-            J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal
-            J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal
-            J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal
-            J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal
-            J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal
-            J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal
-            J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal
-            J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal
-            J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal
-            J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal
-
-            # Take into account lower r flips
-            if ir_flip == -1:
-                J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
-                J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
-                J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
-                J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
-                J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
-                J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
-                J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
-                J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
-                J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
-                J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
-                J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
-                J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
-            else:
-                J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
-                J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
-                J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
-                J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
-                J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
-                J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
-                J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
-                J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
-                J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
-                J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
-                J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
-                J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
-
-            # Cell shifts for the simulation boundaries
-            shift_r = 0
-            shift_z = 0
-            if (ir_cell+1) > (Nr-1):
-                shift_r = -1
-            if (iz_cell+1) > Nz-1:
-                shift_z -= Nz
-
-            # Write ptcl fields to thread-local part of global deposition array
-            j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00
-            j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00
-
-            j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01
-            j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01
-
-            j_r_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10
-            j_r_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10
-
-            j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11
-            j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11
-
-            j_t_m0_global[tx,iz_cell, ir_cell] += J_t_m0_00
-            j_t_m1_global[tx,iz_cell, ir_cell] += J_t_m1_00
-
-            j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01
-            j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01
-
-            j_t_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10
-            j_t_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10
-
-            j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11
-            j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11
-
-            j_z_m0_global[tx,iz_cell, ir_cell] += J_z_m0_00
-            j_z_m1_global[tx,iz_cell, ir_cell] += J_z_m1_00
-
-            j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01
-            j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01
-
-            j_z_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10
-            j_z_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10
-
-            j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11
-            j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11
-
-    return
-
-
-# -------------------------------
-# Field deposition - cubic - rho
-# -------------------------------
-
-@numba.njit(parallel=True)
-def deposit_rho_prange_cubic(x, y, z, w,
-                          invdz, zmin, Nz,
-                          invdr, rmin, Nr,
-                          rho_m0_global, rho_m1_global,
-                          nthreads, tx_chunks):
-    """
-
-    Deposition of the charge density rho using numba prange on the CPU.
-    Iterates over the threads in parallel, while each thread iterates
-    over a batch of particles. Intermediate results for each threads are
-    stored in copies of the global grid. At the end of the parallel loop,
-    the thread-local field arrays are combined (summed) to the global array.
-    (This final reduction is *not* done in this function)
-
-    Calculates the weighted amount of rho that is deposited to the
-    16 cells surounding the particle based on its shape (cubic).
-
-    The rest of the execution is similar to the CUDA equivalent function.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    rho_m0_global, rho_m1_global : 3darrays of complexs (nthread, Nz, Nr)
-        The global helper arrays to store the thread local charge densities
-        on the interpolation grid for mode 0 and 1. 
-        (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the considered direction
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    nthreads : int
-        Number of CPU threads used with numba prange
-
-    tx_chunks : list of int
-        Exact particle batch size per thread.
-        Last batch size can be greater if Ntot is not a multiple of nthreads.
-    """
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    for tx in prange( nthreads ):
-        # Loop over all particles in thread chunk
-        for idx in range( tx_chunks[tx] ):
-            # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_chunks[0]
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1./rj
-                cos = xj*invr  # Cosine
-                sin = yj*invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j*sin
-
-            # Positions of the particles, in the cell unit
-            r_cell = invdr*(rj - rmin) - 0.5
-            z_cell = invdz*(zj - zmin) - 0.5
-
-            # Calculate rho
-            # --------------------------------------------
-            # Mode 0
-            R_m0_scal = wj * exptheta_m0
-            # Mode 1
-            R_m1_scal = wj * exptheta_m1
-
-            # Original index of the uppper and lower cell
-            ir_cell = int(math.floor( r_cell ))
-            iz_cell = int(math.floor( z_cell ))
-
-            # Treat the boundary conditions
-            # guard cells in lower r
-            if ir_cell < 0:
-                ir_cell = 0
-            # absorbing in upper r
-            if ir_cell > Nr-1:
-                ir_cell = Nr-1
-            # periodic boundaries in z
-            if iz_cell < 0:
-                iz_cell += Nz
-            if iz_cell > Nz-1:
-                iz_cell -= Nz
-
-            # Compute values in local copies and consider boundaries
-            ir_flip = int( math.floor(r_cell) ) - 1
-
-            # Declare the local field value for
-            # all possible deposition directions,
-            # depending on the shape order and per mode.
-            R_m0_00 = 0.
-            R_m1_00 = 0. + 0.j
-
-            R_m0_01 = 0.
-            R_m1_01 = 0. + 0.j
-
-            R_m0_02 = 0.
-            R_m1_02 = 0. + 0.j
-
-            R_m0_03 = 0.
-            R_m1_03 = 0. + 0.j
-
-            R_m0_10 = 0.
-            R_m1_10 = 0. + 0.j
-
-            R_m0_11 = 0.
-            R_m1_11 = 0. + 0.j
-
-            R_m0_12 = 0.
-            R_m1_12 = 0. + 0.j
-
-            R_m0_13 = 0.
-            R_m1_13 = 0. + 0.j
-
-            R_m0_20 = 0.
-            R_m1_20 = 0. + 0.j
-
-            R_m0_21 = 0.
-            R_m1_21 = 0. + 0.j
-
-            R_m0_22 = 0.
-            R_m1_22 = 0. + 0.j
-
-            R_m0_23 = 0.
-            R_m1_23 = 0. + 0.j
-
-            R_m0_30 = 0.
-            R_m1_30 = 0. + 0.j
-
-            R_m0_31 = 0.
-            R_m1_31 = 0. + 0.j
-
-            R_m0_32 = 0.
-            R_m1_32 = 0. + 0.j
-
-            R_m0_33 = 0.
-            R_m1_33 = 0. + 0.j
-
-            if (ir_flip == -2):
-                R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-            if (ir_flip == -1):
-                R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
-            if (ir_flip >= 0):
-                R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-                R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
-                R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
-                R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
-                R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
-                R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
-                R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
-                R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
-                R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
-
-            # Index Shifting since local copies are centered around
-            # the current cell
-            srl = 0         # shift r lower
-            sru = 0         # shift r upper inner
-            sru2 = 0        # shift r upper outer
-            szl = 0         # shift z lower
-            szu = 0         # shift z upper inner
-            szu2 = 0        # shift z upper outer
-            if (iz_cell-1) < 0:
-                szl += Nz
-            if (iz_cell) == (Nz - 1):
-                szu -= Nz
-                szu2 -= Nz
-            if (iz_cell+1) == (Nz - 1):
-                szu2 -= Nz
-            if (ir_cell) >= (Nr - 1):
-                sru = -1
-                sru2 = -2
-            if (ir_cell+1) == (Nr - 1):
-                sru2 = -1
-            if (ir_cell-1) < 0:
-                srl = 1
-
-            # Write ptcl fields to thread-local part of global deposition array
-            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00
-            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00
-            rho_m0_global[tx, iz_cell, ir_cell - 1 + srl] += R_m0_01
-            rho_m1_global[tx, iz_cell, ir_cell - 1 + srl] += R_m1_01
-            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02
-            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02
-            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03
-            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03
-            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell] += R_m0_10
-            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell] += R_m1_10
-            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_11
-            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_11
-            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell] += R_m0_12
-            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell] += R_m1_12
-            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m0_13
-            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m1_13
-            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20
-            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20
-            rho_m0_global[tx, iz_cell, ir_cell + 1 + sru] += R_m0_21
-            rho_m1_global[tx, iz_cell, ir_cell + 1 + sru] += R_m1_21
-            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22
-            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22
-            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23
-            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23
-            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30
-            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30
-            rho_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m0_31
-            rho_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m1_31
-            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32
-            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32
-            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33
-            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33
-
-    return
-
-# -------------------------------
-# Field deposition - cubic - J
-# -------------------------------
-
-@numba.njit(parallel=True)
-def deposit_J_prange_cubic(x, y, z, w,
-                        ux, uy, uz, inv_gamma,
-                        invdz, zmin, Nz,
-                        invdr, rmin, Nr,
-                        j_r_m0_global, j_r_m1_global,
-                        j_t_m0_global, j_t_m1_global,
-                        j_z_m0_global, j_z_m1_global,
-                        nthreads, tx_chunks):
-    """
-    Deposition of the current density J using numba prange on the CPU.
-    Iterates over the threads in parallel, while each thread iterates
-    over a batch of particles. Intermediate results for each threads are
-    stored in copies of the global grid. At the end of the parallel loop,
-    the thread-local field arrays are combined (summed) to the global array.
-    (This final reduction is *not* done in this function)
-
-    Calculates the weighted amount of J that is deposited to the
-    16 cells surounding the particle based on its shape (cubic).
-
-    The rest of the execution is similar to the CUDA equivalent function.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    w : 1d array of floats
-        The weights of the particles
-
-    ux, uy, uz : 1darray of floats (in meters * second^-1)
-        The velocity of the particles
-
-    inv_gamma : 1darray of floats
-        The inverse of the relativistic gamma factor
-
-    j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr)
-        The global helper arrays to store the thread local current component
-        in each direction (r, t, z) on the interpolation grid for mode 0 and 1.
-        (is modified by this function)
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box,
-        along the direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    nthreads : int
-        Number of CPU threads used with numba prange
-
-    tx_chunks : list of int
-        Exact particle batch size per thread.
-        Last batch size can be greater if Ntot is not a multiple of nthreads.
-    """
-    # Deposit the field per cell in parallel (for threads < number of cells)
-    for tx in prange( nthreads ):
-        # Loop over all particles in thread chunk
-        for idx in range( tx_chunks[tx] ):
-            # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_chunks[0]
-            # Preliminary arrays for the cylindrical conversion
-            # --------------------------------------------
-            # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
-            # Velocity
-            uxj = ux[ptcl_idx]
-            uyj = uy[ptcl_idx]
-            uzj = uz[ptcl_idx]
-            # Inverse gamma
-            inv_gammaj = inv_gamma[ptcl_idx]
-            # Weights
-            wj = w[ptcl_idx]
-
-            # Cylindrical conversion
-            rj = math.sqrt(xj**2 + yj**2)
-            # Avoid division by 0.
-            if (rj != 0.):
-                invr = 1./rj
-                cos = xj*invr  # Cosine
-                sin = yj*invr  # Sine
-            else:
-                cos = 1.
-                sin = 0.
-            exptheta_m0 = 1.
-            exptheta_m1 = cos + 1.j*sin
-
-            # Get weights for the deposition
-            # --------------------------------------------
-            # Positions of the particles, in the cell unit
-            r_cell = invdr*(rj - rmin) - 0.5
-            z_cell = invdz*(zj - zmin) - 0.5
-
-            # Calculate the currents
-            # --------------------------------------------
-            # Mode 0
-            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
-            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
-            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
-            # Mode 1
-            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
-            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
-            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
-
-            # Original index of the uppper and lower cell
-            ir_cell = int(math.floor( r_cell ))
-            iz_cell = int(math.floor( z_cell ))
-
-            # Treat the boundary conditions
-            # guard cells in lower r
-            if ir_cell < 0:
-                ir_cell = 0
-            # absorbing in upper r
-            if ir_cell > Nr-1:
-                ir_cell = Nr-1
-            # periodic boundaries in z
-            if iz_cell < 0:
-                iz_cell += Nz
-            if iz_cell > Nz-1:
-                iz_cell -= Nz
-
-            # Compute values in local copies and consider boundaries
-            ir_flip = int64(math.floor(r_cell)) - 1
-
-            # Declare the local field value for
-            # all possible deposition directions,
-            # depending on the shape order and per mode for r,t and z.
-            J_r_m0_00 = 0.
-            J_t_m0_00 = 0.
-            J_z_m0_00 = 0.
-            J_r_m1_00 = 0. + 0.j
-            J_t_m1_00 = 0. + 0.j
-            J_z_m1_00 = 0. + 0.j
-
-            J_r_m0_01 = 0.
-            J_t_m0_01 = 0.
-            J_z_m0_01 = 0.
-            J_r_m1_01 = 0. + 0.j
-            J_t_m1_01 = 0. + 0.j
-            J_z_m1_01 = 0. + 0.j
-
-            J_r_m0_02 = 0.
-            J_t_m0_02 = 0.
-            J_z_m0_02 = 0.
-            J_r_m1_02 = 0. + 0.j
-            J_t_m1_02 = 0. + 0.j
-            J_z_m1_02 = 0. + 0.j
-
-            J_r_m0_03 = 0.
-            J_t_m0_03 = 0.
-            J_z_m0_03 = 0.
-            J_r_m1_03 = 0. + 0.j
-            J_t_m1_03 = 0. + 0.j
-            J_z_m1_03 = 0. + 0.j
-
-            J_r_m0_10 = 0.
-            J_t_m0_10 = 0.
-            J_z_m0_10 = 0.
-            J_r_m1_10 = 0. + 0.j
-            J_t_m1_10 = 0. + 0.j
-            J_z_m1_10 = 0. + 0.j
-
-            J_r_m0_11 = 0.
-            J_t_m0_11 = 0.
-            J_z_m0_11 = 0.
-            J_r_m1_11 = 0. + 0.j
-            J_t_m1_11 = 0. + 0.j
-            J_z_m1_11 = 0. + 0.j
-
-            J_r_m0_12 = 0.
-            J_t_m0_12 = 0.
-            J_z_m0_12 = 0.
-            J_r_m1_12 = 0. + 0.j
-            J_t_m1_12 = 0. + 0.j
-            J_z_m1_12 = 0. + 0.j
-
-            J_r_m0_13 = 0.
-            J_t_m0_13 = 0.
-            J_z_m0_13 = 0.
-            J_r_m1_13 = 0. + 0.j
-            J_t_m1_13 = 0. + 0.j
-            J_z_m1_13 = 0. + 0.j
-
-            J_r_m0_20 = 0.
-            J_t_m0_20 = 0.
-            J_z_m0_20 = 0.
-            J_r_m1_20 = 0. + 0.j
-            J_t_m1_20 = 0. + 0.j
-            J_z_m1_20 = 0. + 0.j
-
-            J_r_m0_21 = 0.
-            J_t_m0_21 = 0.
-            J_z_m0_21 = 0.
-            J_r_m1_21 = 0. + 0.j
-            J_t_m1_21 = 0. + 0.j
-            J_z_m1_21 = 0. + 0.j
-
-            J_r_m0_22 = 0.
-            J_t_m0_22 = 0.
-            J_z_m0_22 = 0.
-            J_r_m1_22 = 0. + 0.j
-            J_t_m1_22 = 0. + 0.j
-            J_z_m1_22 = 0. + 0.j
-
-            J_r_m0_23 = 0.
-            J_t_m0_23 = 0.
-            J_z_m0_23 = 0.
-            J_r_m1_23 = 0. + 0.j
-            J_t_m1_23 = 0. + 0.j
-            J_z_m1_23 = 0. + 0.j
-
-            J_r_m0_30 = 0.
-            J_t_m0_30 = 0.
-            J_z_m0_30 = 0.
-            J_r_m1_30 = 0. + 0.j
-            J_t_m1_30 = 0. + 0.j
-            J_z_m1_30 = 0. + 0.j
-
-            J_r_m0_31 = 0.
-            J_t_m0_31 = 0.
-            J_z_m0_31 = 0.
-            J_r_m1_31 = 0. + 0.j
-            J_t_m1_31 = 0. + 0.j
-            J_z_m1_31 = 0. + 0.j
-
-            J_r_m0_32 = 0.
-            J_t_m0_32 = 0.
-            J_z_m0_32 = 0.
-            J_r_m1_32 = 0. + 0.j
-            J_t_m1_32 = 0. + 0.j
-            J_z_m1_32 = 0. + 0.j
-
-            J_r_m0_33 = 0.
-            J_t_m0_33 = 0.
-            J_z_m0_33 = 0.
-            J_r_m1_33 = 0. + 0.j
-            J_t_m1_33 = 0. + 0.j
-            J_z_m1_33 = 0. + 0.j
-
-            if (ir_flip == -2):
-                J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-                J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-
-                J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-            if (ir_flip == -1):
-                J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-            if (ir_flip >= 0):
-                J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
-                J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
-                J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
-                J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
-                J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
-                J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
-                J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
-                J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
-
-                J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
-                J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
-                J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
-                J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
-                J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
-                J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
-                J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
-                J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
-
-                J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-                J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
-                J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
-                J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
-                J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
-                J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
-                J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
-                J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
-                J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
-
-            # Index Shifting since local copies are centered around
-            # the current cell
-            srl = 0         # shift r lower
-            sru = 0         # shift r upper inner
-            sru2 = 0        # shift r upper outer
-            szl = 0         # shift z lower
-            szu = 0         # shift z upper inner
-            szu2 = 0        # shift z upper outer
-            if (iz_cell-1) < 0:
-                szl += Nz
-            if (iz_cell) == (Nz - 1):
-                szu -= Nz
-                szu2 -= Nz
-            if (iz_cell+1) == (Nz - 1):
-                szu2 -= Nz
-            if (ir_cell) >= (Nr - 1):
-                sru = -1
-                sru2 = -2
-            if (ir_cell+1) == (Nr - 1):
-                sru2 = -1
-            if (ir_cell-1) < 0:
-                srl = 1
-
-            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00
-            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00
-            j_r_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m0_01
-            j_r_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m1_01
-            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02
-            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02
-            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03
-            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03
-            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10
-            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10
-            j_r_m0_global[tx, iz_cell, ir_cell] += J_r_m0_11
-            j_r_m1_global[tx, iz_cell, ir_cell] += J_r_m1_11
-            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m0_12
-            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m1_12
-            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13
-            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13
-            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20
-            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20
-            j_r_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m0_21
-            j_r_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m1_21
-            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22
-            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22
-            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23
-            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23
-            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30
-            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30
-            j_r_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31
-            j_r_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31
-            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32
-            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32
-            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33
-            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33
-
-            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00
-            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00
-            j_t_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m0_01
-            j_t_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m1_01
-            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02
-            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02
-            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03
-            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03
-            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10
-            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10
-            j_t_m0_global[tx, iz_cell, ir_cell] += J_t_m0_11
-            j_t_m1_global[tx, iz_cell, ir_cell] += J_t_m1_11
-            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m0_12
-            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m1_12
-            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13
-            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13
-            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20
-            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20
-            j_t_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m0_21
-            j_t_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m1_21
-            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22
-            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22
-            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23
-            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23
-            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30
-            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30
-            j_t_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31
-            j_t_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31
-            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32
-            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32
-            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33
-            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33
-
-            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00
-            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00
-            j_z_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m0_01
-            j_z_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m1_01
-            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02
-            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02
-            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03
-            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03
-            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10
-            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10
-            j_z_m0_global[tx, iz_cell, ir_cell] += J_z_m0_11
-            j_z_m1_global[tx, iz_cell, ir_cell] += J_z_m1_11
-            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m0_12
-            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m1_12
-            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13
-            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13
-            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20
-            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20
-            j_z_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m0_21
-            j_z_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m1_21
-            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22
-            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22
-            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23
-            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23
-            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30
-            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30
-            j_z_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31
-            j_z_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31
-            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32
-            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32
-            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33
-            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33
-
-    return
-
-# -----------------------------------------------------------------------
-# Parallel reduction of the global arrays for threads into a single array
-# -----------------------------------------------------------------------
-
-@numba.njit( parallel=True )
-def sum_reduce_2d_array( global_array, reduced_array ):
-    """
-    Sum the array `global_array` along its first axis and 
-    add it into `reduced_array`.
-
-    Parameters:
-    -----------
-    global_array: 3darray of complexs
-       Field array whose first dimension corresponds to the 
-       reduction dimension (typically: the number of threads used
-       during the current deposition)
-
-    reduced array: 2darray of complexs
-    """
-    # Extract size of each dimension
-    Nreduce, Nz, Nr = global_array.shape
-
-    # Parallel loop over iz
-    for iz in prange( Nz ):
-        # Loop over the reduction dimension (slow dimension)
-        for it in range( Nreduce ):
-            # Loop over ir (fast dimension)
-            for ir in range( Nr ):
-
-                reduced_array[ iz, ir ] +=  global_array[ it, iz, ir ]
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the deposition methods for rho and J for linear and cubic
+order shapes on the CPU with threading.
+"""
+import numba
+from numba import prange, int64
+import math
+from scipy.constants import c
+
+# -------------------------------
+# Particle shape Factor functions
+# -------------------------------
+
+# Linear shapes
+@numba.njit
+def z_shape_linear(cell_position, index):
+    iz = int64(math.floor(cell_position))
+    if index == 0:
+        return iz+1.-cell_position
+    if index == 1:
+        return cell_position - iz
+
+@numba.njit
+def r_shape_linear(cell_position, index):
+    flip_factor = 1.
+    ir = int64(math.floor(cell_position))
+    if index == 0:
+        if ir < 0:
+            flip_factor = -1.
+        return flip_factor*(ir+1.-cell_position)
+    if index == 1:
+        return flip_factor*(cell_position - ir)
+
+# Cubic shapes
+@numba.njit
+def z_shape_cubic(cell_position, index):
+    iz = int64(math.floor(cell_position)) - 1
+    if index == 0:
+        return (-1./6.)*((cell_position-iz)-2)**3
+    if index == 1:
+        return (1./6.)*(3*((cell_position-(iz+1))**3)-6*((cell_position-(iz+1))**2)+4)
+    if index == 2:
+        return (1./6.)*(3*(((iz+2)-cell_position)**3)-6*(((iz+2)-cell_position)**2)+4)
+    if index == 3:
+        return (-1./6.)*(((iz+3)-cell_position)-2)**3
+
+@numba.njit
+def r_shape_cubic(cell_position, index):
+    flip_factor = 1.
+    ir = int64(math.floor(cell_position)) - 1
+    if index == 0:
+        if ir < 0:
+            flip_factor = -1.
+        return flip_factor*(-1./6.)*((cell_position-ir)-2)**3
+    if index == 1:
+        if ir+1 < 0:
+            flip_factor = -1.
+        return flip_factor*(1./6.)*(3*((cell_position-(ir+1))**3)-6*((cell_position-(ir+1))**2)+4)
+    if index == 2:
+        if ir+2 < 0:
+            flip_factor = -1.
+        return flip_factor*(1./6.)*(3*(((ir+2)-cell_position)**3)-6*(((ir+2)-cell_position)**2)+4)
+    if index == 3:
+        if ir+3 < 0:
+            flip_factor = -1.
+        return flip_factor*(-1./6.)*(((ir+3)-cell_position)-2)**3
+
+# -------------------------------
+# Field deposition - linear - rho
+# -------------------------------
+
+@numba.njit(parallel=True)
+def deposit_rho_prange_linear(x, y, z, w,
+                           invdz, zmin, Nz,
+                           invdr, rmin, Nr,
+                           rho_m0_global, rho_m1_global,
+                           nthreads, tx_chunks):
+    """
+    Deposition of the charge density rho using numba prange on the CPU.
+    Iterates over the threads in parallel, while each thread iterates
+    over a batch of particles. Intermediate results for each threads are
+    stored in copies of the global grid. At the end of the parallel loop,
+    the thread-local field arrays are combined (summed) to a global array.
+    (This final reduction is *not* done in this function)
+
+    Calculates the weighted amount of rho that is deposited to the
+    4 cells surrounding the particle based on its shape (linear).
+
+    The rest of the execution is similar to the CUDA equivalent function.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    rho_m0_global, rho_m1_global : 3darrays of complex (nthreads, Nz, Nr)
+        The global helper arrays to store the thread local charge densities
+        on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the considered direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    nthreads : int
+        Number of CPU threads used with numba prange
+
+    tx_chunks : list of int
+        Exact particle batch size per thread.
+        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    """
+    # Loop over the threads in parallel; each thread deposits its own chunk
+    for tx in prange( nthreads ):
+        # Loop over all particles in thread chunk
+        for idx in range( tx_chunks[tx] ):
+            # Calculate thread local particle index
+            ptcl_idx = idx + tx*tx_chunks[0]
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate rho
+            # --------------------------------------------
+            # Mode 0
+            R_m0_scal = wj * exptheta_m0
+            # Mode 1
+            R_m1_scal = wj * exptheta_m1
+
+            # Original index of the upper and lower cell
+            ir_cell = int(math.floor( r_cell ))
+            iz_cell = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_cell < 0:
+                ir_cell = 0
+            # absorbing in upper r
+            if ir_cell > Nr-1:
+                ir_cell = Nr-1
+            # periodic boundaries in z
+            if iz_cell < 0:
+                iz_cell += Nz
+            if iz_cell > Nz-1:
+                iz_cell -= Nz
+
+            # Boundary Region Shifts
+            ir_flip = int( math.floor(r_cell) )
+
+            # Declare local field array
+            R_m0_00 = 0.
+            R_m0_01 = 0.
+            R_m0_10 = 0.
+            R_m0_11 = 0.
+
+            R_m1_00 = 0. + 0.j
+            R_m1_01 = 0. + 0.j
+            R_m1_10 = 0. + 0.j
+            R_m1_11 = 0. + 0.j
+
+            R_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m0_scal
+            R_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m0_scal
+            R_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * R_m1_scal
+            R_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * R_m1_scal
+
+            if ir_flip == -1:
+                R_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
+                R_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
+                R_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
+                R_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
+            else:
+                R_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m0_scal
+                R_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m0_scal
+                R_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * R_m1_scal
+                R_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * R_m1_scal
+
+            # Cell shifts for the simulation boundaries
+            shift_r = 0
+            shift_z = 0
+            if ir_cell+1 > (Nr-1):
+                shift_r = -1
+            if iz_cell+1 > Nz-1:
+                shift_z -= Nz
+
+            # Write ptcl fields to thread-local part of global deposition array
+            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00
+            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00
+
+            rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m0_01
+            rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m1_01
+
+            rho_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m0_10
+            rho_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m1_10
+
+            rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
+            rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11
+
+    return
+
+# -------------------------------
+# Field deposition - linear - J
+# -------------------------------
+
+@numba.njit(parallel=True)
+def deposit_J_prange_linear(x, y, z, w,
+                         ux, uy, uz, inv_gamma,
+                         invdz, zmin, Nz,
+                         invdr, rmin, Nr,
+                         j_r_m0_global, j_r_m1_global,
+                         j_t_m0_global, j_t_m1_global,
+                         j_z_m0_global, j_z_m1_global,
+                         nthreads, tx_chunks):
+    """
+    Deposition of the current density J using numba prange on the CPU.
+    Iterates over the threads in parallel, while each thread iterates
+    over a batch of particles. Intermediate results for each thread are
+    stored in copies of the global grid. At the end of the parallel loop,
+    the thread-local field arrays are combined (summed) to the global array.
+    (This final reduction is *not* done in this function)
+
+    Calculates the weighted amount of J that is deposited to the
+    4 cells surrounding the particle based on its shape (linear).
+
+    The rest of the execution is similar to the CUDA equivalent function.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (dimensionless)
+        The normalized momenta of the particles (gamma * v / c)
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_x_m0_global, j_x_m1_global : 3darrays of complex (nthreads, Nz, Nr)
+        The global helper arrays to store the thread local current component
+        in each direction (r, t, z) on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    nthreads : int
+        Number of CPU threads used with numba prange
+
+    tx_chunks : list of int
+        Exact particle batch size per thread.
+        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    """
+    # Loop over the threads in parallel; each thread deposits its own chunk
+    for tx in prange( nthreads ):
+        # Loop over all particles in thread chunk
+        for idx in range( tx_chunks[tx] ):
+            # Calculate thread local particle index
+            ptcl_idx = idx + tx*tx_chunks[0]
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Velocity
+            uxj = ux[ptcl_idx]
+            uyj = uy[ptcl_idx]
+            uzj = uz[ptcl_idx]
+            # Inverse gamma
+            inv_gammaj = inv_gamma[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Get weights for the deposition
+            # --------------------------------------------
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate the currents
+            # --------------------------------------------
+            # Mode 0
+            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
+            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
+            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
+            # Mode 1
+            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
+            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
+            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
+
+            # Original index of the upper and lower cell
+            ir_cell = int(math.floor( r_cell ))
+            iz_cell = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_cell < 0:
+                ir_cell = 0
+            # absorbing in upper r
+            if ir_cell > Nr-1:
+                ir_cell = Nr-1
+            # periodic boundaries in z
+            if iz_cell < 0:
+                iz_cell += Nz
+            if iz_cell > Nz-1:
+                iz_cell -= Nz
+
+            # Boundary Region Shifts
+            ir_flip = int( math.floor(r_cell) )
+
+            # Declare local field arrays
+            J_r_m0_00 = 0.
+            J_r_m1_00 = 0. + 0.j
+            J_t_m0_00 = 0.
+            J_t_m1_00 = 0. + 0.j
+            J_z_m0_00 = 0.
+            J_z_m1_00 = 0. + 0.j
+
+            J_r_m0_01 = 0.
+            J_r_m1_01 = 0. + 0.j
+            J_t_m0_01 = 0.
+            J_t_m1_01 = 0. + 0.j
+            J_z_m0_01 = 0.
+            J_z_m1_01 = 0. + 0.j
+
+            J_r_m0_10 = 0.
+            J_r_m1_10 = 0. + 0.j
+            J_t_m0_10 = 0.
+            J_t_m1_10 = 0. + 0.j
+            J_z_m0_10 = 0.
+            J_z_m1_10 = 0. + 0.j
+
+            J_r_m0_11 = 0.
+            J_r_m1_11 = 0. + 0.j
+            J_t_m0_11 = 0.
+            J_t_m1_11 = 0. + 0.j
+            J_z_m0_11 = 0.
+            J_z_m1_11 = 0. + 0.j
+
+            J_r_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+            J_t_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+            J_z_m0_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+            J_r_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+            J_t_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+            J_z_m0_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+            J_r_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+            J_t_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+            J_z_m1_00 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+            J_r_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+            J_t_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+            J_z_m1_01 += r_shape_linear(r_cell, 0)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+
+            # Take into account lower r flips
+            if ir_flip == -1:
+                J_r_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+                J_t_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+                J_z_m0_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+                J_r_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+                J_t_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+                J_z_m0_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+                J_r_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+                J_t_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+                J_z_m1_00 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+                J_r_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+                J_t_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+                J_z_m1_01 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+            else:
+                J_r_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m0_scal
+                J_t_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m0_scal
+                J_z_m0_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m0_scal
+                J_r_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m0_scal
+                J_t_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m0_scal
+                J_z_m0_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m0_scal
+                J_r_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_r_m1_scal
+                J_t_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_t_m1_scal
+                J_z_m1_10 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 0) * J_z_m1_scal
+                J_r_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_r_m1_scal
+                J_t_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_t_m1_scal
+                J_z_m1_11 += r_shape_linear(r_cell, 1)*z_shape_linear(z_cell, 1) * J_z_m1_scal
+
+            # Cell shifts for the simulation boundaries
+            shift_r = 0
+            shift_z = 0
+            if (ir_cell+1) > (Nr-1):
+                shift_r = -1
+            if (iz_cell+1) > Nz-1:
+                shift_z -= Nz
+
+            # Write ptcl fields to thread-local part of global deposition array
+            j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00
+            j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00
+
+            j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01
+            j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01
+
+            j_r_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10
+            j_r_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10
+
+            j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11
+            j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11
+
+            j_t_m0_global[tx,iz_cell, ir_cell] += J_t_m0_00
+            j_t_m1_global[tx,iz_cell, ir_cell] += J_t_m1_00
+
+            j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01
+            j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01
+
+            j_t_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10
+            j_t_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10
+
+            j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11
+            j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11
+
+            j_z_m0_global[tx,iz_cell, ir_cell] += J_z_m0_00
+            j_z_m1_global[tx,iz_cell, ir_cell] += J_z_m1_00
+
+            j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01
+            j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01
+
+            j_z_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10
+            j_z_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10
+
+            j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11
+            j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11
+
+    return
+
+
+# -------------------------------
+# Field deposition - cubic - rho
+# -------------------------------
+
+@numba.njit(parallel=True)
+def deposit_rho_prange_cubic(x, y, z, w,
+                          invdz, zmin, Nz,
+                          invdr, rmin, Nr,
+                          rho_m0_global, rho_m1_global,
+                          nthreads, tx_chunks):
+    """
+    Deposit the charge density rho on the CPU, using numba prange.
+
+    The outer loop runs over the threads in parallel, and each thread
+    iterates over its own batch of particles. Thread `tx` only adds to
+    its own slice `[tx, :, :]` of the helper arrays. Summing these
+    slices into the final grid is *not* done in this function.
+
+    Each particle is deposited on 16 grid points (4 along z times
+    4 along r), weighted by the cubic shape factors given by
+    `r_shape_cubic` and `z_shape_cubic`.
+    Contributions that would fall below the lower r edge of the grid
+    are added to the first radial cells instead.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1darray of floats
+        The weights of the particles
+
+    rho_m0_global, rho_m1_global : 3darrays of complex (nthreads, Nz, Nr)
+        The helper arrays that store the thread-local charge densities
+        on the interpolation grid for mode 0 and 1.
+        (modified in place by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the considered direction
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    nthreads : int
+        Number of CPU threads used with numba prange
+
+    tx_chunks : list of int
+        Number of particles handled by each thread. The batch of thread tx
+        starts at index tx*tx_chunks[0]; only the last batch may be larger.
+    """
+    # Loop over the threads in parallel; each one deposits its own batch
+    for tx in prange( nthreads ):
+        # Loop over all particles in the batch of this thread
+        for idx in range( tx_chunks[tx] ):
+            # Global particle index (batch tx starts at tx*tx_chunks[0])
+            ptcl_idx = idx + tx*tx_chunks[0]
+            # Particle quantities needed for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Position of the particle, in cell units
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate rho
+            # --------------------------------------------
+            # Mode 0
+            R_m0_scal = wj * exptheta_m0
+            # Mode 1
+            R_m1_scal = wj * exptheta_m1
+
+            # Integer part of the position (before boundary treatment)
+            ir_cell = int(math.floor( r_cell ))
+            iz_cell = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_cell < 0:
+                ir_cell = 0
+            # absorbing in upper r
+            if ir_cell > Nr-1:
+                ir_cell = Nr-1
+            # periodic boundaries in z
+            if iz_cell < 0:
+                iz_cell += Nz
+            if iz_cell > Nz-1:
+                iz_cell -= Nz
+
+            # Unclamped r index of the lowest stencil point (< 0: off-grid)
+            ir_flip = int( math.floor(r_cell) ) - 1
+
+            # Local accumulators, one per mode and per point of the 4x4
+            # stencil: in R_m<mode>_<i><j>, i is the position along r
+            # and j the position along z (0: lowest, 3: highest).
+            R_m0_00 = 0.
+            R_m1_00 = 0. + 0.j
+
+            R_m0_01 = 0.
+            R_m1_01 = 0. + 0.j
+
+            R_m0_02 = 0.
+            R_m1_02 = 0. + 0.j
+
+            R_m0_03 = 0.
+            R_m1_03 = 0. + 0.j
+
+            R_m0_10 = 0.
+            R_m1_10 = 0. + 0.j
+
+            R_m0_11 = 0.
+            R_m1_11 = 0. + 0.j
+
+            R_m0_12 = 0.
+            R_m1_12 = 0. + 0.j
+
+            R_m0_13 = 0.
+            R_m1_13 = 0. + 0.j
+
+            R_m0_20 = 0.
+            R_m1_20 = 0. + 0.j
+
+            R_m0_21 = 0.
+            R_m1_21 = 0. + 0.j
+
+            R_m0_22 = 0.
+            R_m1_22 = 0. + 0.j
+
+            R_m0_23 = 0.
+            R_m1_23 = 0. + 0.j
+
+            R_m0_30 = 0.
+            R_m1_30 = 0. + 0.j
+
+            R_m0_31 = 0.
+            R_m1_31 = 0. + 0.j
+
+            R_m0_32 = 0.
+            R_m1_32 = 0. + 0.j
+
+            R_m0_33 = 0.
+            R_m1_33 = 0. + 0.j
+            # 2 lowest r points off-grid: mirrored onto radial cells 1 and 0
+            if (ir_flip == -2):
+                R_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+            # Lowest r point off-grid: its share is added to radial cell 0
+            if (ir_flip == -1):
+                R_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+            if (ir_flip >= 0):
+                R_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+                R_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m0_scal
+                R_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*R_m1_scal
+                R_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m0_scal
+                R_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*R_m1_scal
+                R_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m0_scal
+                R_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*R_m1_scal
+                R_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m0_scal
+                R_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*R_m1_scal
+
+            # Index shifts that keep the stencil inside the grid:
+            # wrap around in z (periodic) and stay on the edge cells in r
+            srl = 0         # shift r lower
+            sru = 0         # shift r upper inner
+            sru2 = 0        # shift r upper outer
+            szl = 0         # shift z lower
+            szu = 0         # shift z upper inner
+            szu2 = 0        # shift z upper outer
+            if (iz_cell-1) < 0:
+                szl += Nz
+            if (iz_cell) == (Nz - 1):
+                szu -= Nz
+                szu2 -= Nz
+            if (iz_cell+1) == (Nz - 1):
+                szu2 -= Nz
+            if (ir_cell) >= (Nr - 1):
+                sru = -1
+                sru2 = -2
+            if (ir_cell+1) == (Nr - 1):
+                sru2 = -1
+            if (ir_cell-1) < 0:
+                srl = 1
+
+            # Add the contributions to the slice of this thread (index tx)
+            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00
+            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00
+            rho_m0_global[tx, iz_cell, ir_cell - 1 + srl] += R_m0_01
+            rho_m1_global[tx, iz_cell, ir_cell - 1 + srl] += R_m1_01
+            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02
+            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02
+            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03
+            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03
+            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell] += R_m0_10
+            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell] += R_m1_10
+            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_11
+            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_11
+            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell] += R_m0_12
+            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell] += R_m1_12
+            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m0_13
+            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m1_13
+            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20
+            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20
+            rho_m0_global[tx, iz_cell, ir_cell + 1 + sru] += R_m0_21
+            rho_m1_global[tx, iz_cell, ir_cell + 1 + sru] += R_m1_21
+            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22
+            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22
+            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23
+            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23
+            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30
+            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30
+            rho_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m0_31
+            rho_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m1_31
+            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32
+            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32
+            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33
+            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33
+
+    return
+
+# -------------------------------
+# Field deposition - cubic - J
+# -------------------------------
+
+@numba.njit(parallel=True)
+def deposit_J_prange_cubic(x, y, z, w,
+                        ux, uy, uz, inv_gamma,
+                        invdz, zmin, Nz,
+                        invdr, rmin, Nr,
+                        j_r_m0_global, j_r_m1_global,
+                        j_t_m0_global, j_t_m1_global,
+                        j_z_m0_global, j_z_m1_global,
+                        nthreads, tx_chunks):
+    """
+    Deposition of the current density J using numba prange on the CPU.
+    Iterates over the threads in parallel, while each thread iterates
+    over a batch of particles. Intermediate results for each threads are
+    stored in copies of the global grid. At the end of the parallel loop,
+    the thread-local field arrays are combined (summed) to the global array.
+    (This final reduction is *not* done in this function)
+
+    Calculates the weighted amount of J that is deposited to the
+    16 cells surounding the particle based on its shape (cubic).
+
+    The rest of the execution is similar to the CUDA equivalent function.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    w : 1d array of floats
+        The weights of the particles
+
+    ux, uy, uz : 1darray of floats (in meters * second^-1)
+        The velocity of the particles
+
+    inv_gamma : 1darray of floats
+        The inverse of the relativistic gamma factor
+
+    j_x_m0_global, j_x_m1_global : 3darrays of complexs (nthread, Nz, Nr)
+        The global helper arrays to store the thread local current component
+        in each direction (r, t, z) on the interpolation grid for mode 0 and 1.
+        (is modified by this function)
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box,
+        along the direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    nthreads : int
+        Number of CPU threads used with numba prange
+
+    tx_chunks : list of int
+        Exact particle batch size per thread.
+        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    """
+    # Deposit the field per cell in parallel (for threads < number of cells)
+    for tx in prange( nthreads ):
+        # Loop over all particles in thread chunk
+        for idx in range( tx_chunks[tx] ):
+            # Calculate thread local particle index
+            ptcl_idx = idx + tx*tx_chunks[0]
+            # Preliminary arrays for the cylindrical conversion
+            # --------------------------------------------
+            # Position
+            xj = x[ptcl_idx]
+            yj = y[ptcl_idx]
+            zj = z[ptcl_idx]
+            # Velocity
+            uxj = ux[ptcl_idx]
+            uyj = uy[ptcl_idx]
+            uzj = uz[ptcl_idx]
+            # Inverse gamma
+            inv_gammaj = inv_gamma[ptcl_idx]
+            # Weights
+            wj = w[ptcl_idx]
+
+            # Cylindrical conversion
+            rj = math.sqrt(xj**2 + yj**2)
+            # Avoid division by 0.
+            if (rj != 0.):
+                invr = 1./rj
+                cos = xj*invr  # Cosine
+                sin = yj*invr  # Sine
+            else:
+                cos = 1.
+                sin = 0.
+            exptheta_m0 = 1.
+            exptheta_m1 = cos + 1.j*sin
+
+            # Get weights for the deposition
+            # --------------------------------------------
+            # Positions of the particles, in the cell unit
+            r_cell = invdr*(rj - rmin) - 0.5
+            z_cell = invdz*(zj - zmin) - 0.5
+
+            # Calculate the currents
+            # --------------------------------------------
+            # Mode 0
+            J_r_m0_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m0
+            J_t_m0_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m0
+            J_z_m0_scal = wj * c * inv_gammaj*uzj * exptheta_m0
+            # Mode 1
+            J_r_m1_scal = wj * c * inv_gammaj*(cos*uxj + sin*uyj) * exptheta_m1
+            J_t_m1_scal = wj * c * inv_gammaj*(cos*uyj - sin*uxj) * exptheta_m1
+            J_z_m1_scal = wj * c * inv_gammaj*uzj * exptheta_m1
+
+            # Original index of the uppper and lower cell
+            ir_cell = int(math.floor( r_cell ))
+            iz_cell = int(math.floor( z_cell ))
+
+            # Treat the boundary conditions
+            # guard cells in lower r
+            if ir_cell < 0:
+                ir_cell = 0
+            # absorbing in upper r
+            if ir_cell > Nr-1:
+                ir_cell = Nr-1
+            # periodic boundaries in z
+            if iz_cell < 0:
+                iz_cell += Nz
+            if iz_cell > Nz-1:
+                iz_cell -= Nz
+
+            # Compute values in local copies and consider boundaries
+            ir_flip = int64(math.floor(r_cell)) - 1
+
+            # Declare the local field value for
+            # all possible deposition directions,
+            # depending on the shape order and per mode for r,t and z.
+            J_r_m0_00 = 0.
+            J_t_m0_00 = 0.
+            J_z_m0_00 = 0.
+            J_r_m1_00 = 0. + 0.j
+            J_t_m1_00 = 0. + 0.j
+            J_z_m1_00 = 0. + 0.j
+
+            J_r_m0_01 = 0.
+            J_t_m0_01 = 0.
+            J_z_m0_01 = 0.
+            J_r_m1_01 = 0. + 0.j
+            J_t_m1_01 = 0. + 0.j
+            J_z_m1_01 = 0. + 0.j
+
+            J_r_m0_02 = 0.
+            J_t_m0_02 = 0.
+            J_z_m0_02 = 0.
+            J_r_m1_02 = 0. + 0.j
+            J_t_m1_02 = 0. + 0.j
+            J_z_m1_02 = 0. + 0.j
+
+            J_r_m0_03 = 0.
+            J_t_m0_03 = 0.
+            J_z_m0_03 = 0.
+            J_r_m1_03 = 0. + 0.j
+            J_t_m1_03 = 0. + 0.j
+            J_z_m1_03 = 0. + 0.j
+
+            J_r_m0_10 = 0.
+            J_t_m0_10 = 0.
+            J_z_m0_10 = 0.
+            J_r_m1_10 = 0. + 0.j
+            J_t_m1_10 = 0. + 0.j
+            J_z_m1_10 = 0. + 0.j
+
+            J_r_m0_11 = 0.
+            J_t_m0_11 = 0.
+            J_z_m0_11 = 0.
+            J_r_m1_11 = 0. + 0.j
+            J_t_m1_11 = 0. + 0.j
+            J_z_m1_11 = 0. + 0.j
+
+            J_r_m0_12 = 0.
+            J_t_m0_12 = 0.
+            J_z_m0_12 = 0.
+            J_r_m1_12 = 0. + 0.j
+            J_t_m1_12 = 0. + 0.j
+            J_z_m1_12 = 0. + 0.j
+
+            J_r_m0_13 = 0.
+            J_t_m0_13 = 0.
+            J_z_m0_13 = 0.
+            J_r_m1_13 = 0. + 0.j
+            J_t_m1_13 = 0. + 0.j
+            J_z_m1_13 = 0. + 0.j
+
+            J_r_m0_20 = 0.
+            J_t_m0_20 = 0.
+            J_z_m0_20 = 0.
+            J_r_m1_20 = 0. + 0.j
+            J_t_m1_20 = 0. + 0.j
+            J_z_m1_20 = 0. + 0.j
+
+            J_r_m0_21 = 0.
+            J_t_m0_21 = 0.
+            J_z_m0_21 = 0.
+            J_r_m1_21 = 0. + 0.j
+            J_t_m1_21 = 0. + 0.j
+            J_z_m1_21 = 0. + 0.j
+
+            J_r_m0_22 = 0.
+            J_t_m0_22 = 0.
+            J_z_m0_22 = 0.
+            J_r_m1_22 = 0. + 0.j
+            J_t_m1_22 = 0. + 0.j
+            J_z_m1_22 = 0. + 0.j
+
+            J_r_m0_23 = 0.
+            J_t_m0_23 = 0.
+            J_z_m0_23 = 0.
+            J_r_m1_23 = 0. + 0.j
+            J_t_m1_23 = 0. + 0.j
+            J_z_m1_23 = 0. + 0.j
+
+            J_r_m0_30 = 0.
+            J_t_m0_30 = 0.
+            J_z_m0_30 = 0.
+            J_r_m1_30 = 0. + 0.j
+            J_t_m1_30 = 0. + 0.j
+            J_z_m1_30 = 0. + 0.j
+
+            J_r_m0_31 = 0.
+            J_t_m0_31 = 0.
+            J_z_m0_31 = 0.
+            J_r_m1_31 = 0. + 0.j
+            J_t_m1_31 = 0. + 0.j
+            J_z_m1_31 = 0. + 0.j
+
+            J_r_m0_32 = 0.
+            J_t_m0_32 = 0.
+            J_z_m0_32 = 0.
+            J_r_m1_32 = 0. + 0.j
+            J_t_m1_32 = 0. + 0.j
+            J_z_m1_32 = 0. + 0.j
+
+            J_r_m0_33 = 0.
+            J_t_m0_33 = 0.
+            J_z_m0_33 = 0.
+            J_r_m1_33 = 0. + 0.j
+            J_t_m1_33 = 0. + 0.j
+            J_z_m1_33 = 0. + 0.j
+
+            if (ir_flip == -2):
+                J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+                J_r_m0_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+
+                J_t_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m1_20 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+            if (ir_flip == -1):
+                J_r_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+            if (ir_flip >= 0):
+                J_r_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_r_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m0_scal
+                J_r_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_r_m1_scal
+                J_r_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m0_scal
+                J_r_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_r_m1_scal
+                J_r_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m0_scal
+                J_r_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_r_m1_scal
+                J_r_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m0_scal
+                J_r_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_r_m1_scal
+
+                J_t_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_t_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m0_scal
+                J_t_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_t_m1_scal
+                J_t_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m0_scal
+                J_t_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_t_m1_scal
+                J_t_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m0_scal
+                J_t_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_t_m1_scal
+                J_t_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m0_scal
+                J_t_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_t_m1_scal
+
+                J_z_m0_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_00 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_01 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_02 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_03 += r_shape_cubic(r_cell, 0)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_10 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_11 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_12 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_13 += r_shape_cubic(r_cell, 1)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_20 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_21 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_22 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_23 += r_shape_cubic(r_cell, 2)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+                J_z_m0_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m0_scal
+                J_z_m1_30 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 0)*J_z_m1_scal
+                J_z_m0_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m0_scal
+                J_z_m1_31 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 1)*J_z_m1_scal
+                J_z_m0_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m0_scal
+                J_z_m1_32 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 2)*J_z_m1_scal
+                J_z_m0_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m0_scal
+                J_z_m1_33 += r_shape_cubic(r_cell, 3)*z_shape_cubic(z_cell, 3)*J_z_m1_scal
+
+            # Index Shifting since local copies are centered around
+            # the current cell
+            srl = 0         # shift r lower
+            sru = 0         # shift r upper inner
+            sru2 = 0        # shift r upper outer
+            szl = 0         # shift z lower
+            szu = 0         # shift z upper inner
+            szu2 = 0        # shift z upper outer
+            if (iz_cell-1) < 0:
+                szl += Nz
+            if (iz_cell) == (Nz - 1):
+                szu -= Nz
+                szu2 -= Nz
+            if (iz_cell+1) == (Nz - 1):
+                szu2 -= Nz
+            if (ir_cell) >= (Nr - 1):
+                sru = -1
+                sru2 = -2
+            if (ir_cell+1) == (Nr - 1):
+                sru2 = -1
+            if (ir_cell-1) < 0:
+                srl = 1
+
+            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00
+            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00
+            j_r_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m0_01
+            j_r_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m1_01
+            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02
+            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02
+            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03
+            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03
+            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10
+            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10
+            j_r_m0_global[tx, iz_cell, ir_cell] += J_r_m0_11
+            j_r_m1_global[tx, iz_cell, ir_cell] += J_r_m1_11
+            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m0_12
+            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m1_12
+            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13
+            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13
+            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20
+            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20
+            j_r_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m0_21
+            j_r_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m1_21
+            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22
+            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22
+            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23
+            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23
+            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30
+            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30
+            j_r_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31
+            j_r_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31
+            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32
+            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32
+            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33
+            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33
+
+            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00
+            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00
+            j_t_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m0_01
+            j_t_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m1_01
+            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02
+            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02
+            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03
+            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03
+            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10
+            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10
+            j_t_m0_global[tx, iz_cell, ir_cell] += J_t_m0_11
+            j_t_m1_global[tx, iz_cell, ir_cell] += J_t_m1_11
+            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m0_12
+            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m1_12
+            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13
+            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13
+            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20
+            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20
+            j_t_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m0_21
+            j_t_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m1_21
+            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22
+            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22
+            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23
+            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23
+            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30
+            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30
+            j_t_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31
+            j_t_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31
+            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32
+            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32
+            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33
+            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33
+
+            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00
+            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00
+            j_z_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m0_01
+            j_z_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m1_01
+            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02
+            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02
+            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03
+            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03
+            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10
+            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10
+            j_z_m0_global[tx, iz_cell, ir_cell] += J_z_m0_11
+            j_z_m1_global[tx, iz_cell, ir_cell] += J_z_m1_11
+            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m0_12
+            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m1_12
+            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13
+            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13
+            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20
+            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20
+            j_z_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m0_21
+            j_z_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m1_21
+            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22
+            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22
+            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23
+            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23
+            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30
+            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30
+            j_z_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31
+            j_z_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31
+            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32
+            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32
+            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33
+            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33
+
+    return
+
+# -----------------------------------------------------------------------
+# Parallel reduction of the global arrays for threads into a single array
+# -----------------------------------------------------------------------
+
+@numba.njit( parallel=True )
+def sum_reduce_2d_array( global_array, reduced_array ):
+    """
+    Accumulate the sum of `global_array` over its first axis
+    into `reduced_array` (in place; existing values are kept).
+
+    Parameters:
+    -----------
+    global_array: 3darray of complex numbers
+       Field array; the first axis is the one being summed over
+       (typically one slice per thread of the current deposition),
+       the remaining two axes are indexed as (iz, ir)
+
+    reduced_array: 2darray of complex numbers
+       Array indexed as (iz, ir), incremented in place
+    """
+    # Sizes of the reduction axis and of the two grid axes
+    n_copies, n_z, n_r = global_array.shape
+
+    # Distinct iz rows are handled by distinct prange iterations
+    for i_z in prange( n_z ):
+        # The copies along the first axis are added one after the other
+        for i_copy in range( n_copies ):
+            # Innermost loop runs over the last (ir) axis
+            for i_r in range( n_r ):
+                reduced_array[ i_z, i_r ] += global_array[ i_copy, i_z, i_r ]
diff --git a/fbpic/particles/gathering/threading_methods.py b/fbpic/particles/gathering/threading_methods.py
index 2c996a5ff..9809fcce5 100644
--- a/fbpic/particles/gathering/threading_methods.py
+++ b/fbpic/particles/gathering/threading_methods.py
@@ -1,601 +1,601 @@
-# Copyright 2016, FBPIC contributors
-# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
-# License: 3-Clause-BSD-LBNL
-"""
-This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
-It defines the field gathering methods linear and cubic order shapes 
-on the CPU with threading
-"""
-from numba import prange, int64
-import numba
-import math
-import numpy as np
-
-# -----------------------
-# Field gathering linear
-# -----------------------
-
-@numba.njit(parallel=True)
-def gather_field_prange_linear(x, y, z,
-                    invdz, zmin, Nz,
-                    invdr, rmin, Nr,
-                    Er_m0, Et_m0, Ez_m0,
-                    Er_m1, Et_m1, Ez_m1,
-                    Br_m0, Bt_m0, Bz_m0,
-                    Br_m1, Bt_m1, Bz_m1,
-                    Ex, Ey, Ez,
-                    Bx, By, Bz):
-    """
-    Gathering of the fields (E and B) using numba with multi-threading.
-    Iterates over the particles, calculates the weighted amount
-    of fields acting on each particle based on its shape (linear).
-    Fields are gathered in cylindrical coordinates and then
-    transformed to cartesian coordinates.
-    Supports only mode 0 and 1.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box along the
-        direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    Er_m0, Et_m0, Ez_m0 : 2darray of complexs
-        The electric fields on the interpolation grid for the mode 0
-
-    Er_m1, Et_m1, Ez_m1 : 2darray of complexs
-        The electric fields on the interpolation grid for the mode 1
-
-    Br_m0, Bt_m0, Bz_m0 : 2darray of complexs
-        The magnetic fields on the interpolation grid for the mode 0
-
-    Br_m1, Bt_m1, Bz_m1 : 2darray of complexs
-        The magnetic fields on the interpolation grid for the mode 1
-
-    Ex, Ey, Ez : 1darray of floats
-        The electric fields acting on the particles
-        (is modified by this function)
-
-    Bx, By, Bz : 1darray of floats
-        The magnetic fields acting on the particles
-        (is modified by this function)
-    """
-    # Deposit the field per cell in parallel
-    for i in prange(x.shape[0]):
-        # Preliminary arrays for the cylindrical conversion
-        # --------------------------------------------
-        # Position
-        xj = x[i]
-        yj = y[i]
-        zj = z[i]
-
-        # Cylindrical conversion
-        rj = math.sqrt( xj**2 + yj**2 )
-        if (rj !=0. ) :
-            invr = 1./rj
-            cos = xj*invr  # Cosine
-            sin = yj*invr  # Sine
-        else :
-            cos = 1.
-            sin = 0.
-        exptheta_m0 = 1.
-        exptheta_m1 = cos - 1.j*sin
-
-        # Get linear weights for the deposition
-        # --------------------------------------------
-        # Positions of the particles, in the cell unit
-        r_cell =  invdr*(rj - rmin) - 0.5
-        z_cell =  invdz*(zj - zmin) - 0.5
-        # Original index of the uppper and lower cell
-        ir_lower = int(math.floor( r_cell ))
-        ir_upper = ir_lower + 1
-        iz_lower = int(math.floor( z_cell ))
-        iz_upper = iz_lower + 1
-        # Linear weight
-        Sr_lower = ir_upper - r_cell
-        Sr_upper = r_cell - ir_lower
-        Sz_lower = iz_upper - z_cell
-        Sz_upper = z_cell - iz_lower
-        # Set guard weights to zero
-        Sr_guard = 0.
-
-        # Treat the boundary conditions
-        # --------------------------------------------
-        # guard cells in lower r
-        if ir_lower < 0:
-            Sr_guard = Sr_lower
-            Sr_lower = 0.
-            ir_lower = 0
-        # absorbing in upper r
-        if ir_lower > Nr-1:
-            ir_lower = Nr-1
-        if ir_upper > Nr-1:
-            ir_upper = Nr-1
-        # periodic boundaries in z
-        # lower z boundaries
-        if iz_lower < 0:
-            iz_lower += Nz
-        if iz_upper < 0:
-            iz_upper += Nz
-        # upper z boundaries
-        if iz_lower > Nz-1:
-            iz_lower -= Nz
-        if iz_upper > Nz-1:
-            iz_upper -= Nz
-
-        #Precalculate Shapes
-        S_ll = Sz_lower*Sr_lower
-        S_lu = Sz_lower*Sr_upper
-        S_ul = Sz_upper*Sr_lower
-        S_uu = Sz_upper*Sr_upper
-        S_lg = Sz_lower*Sr_guard
-        S_ug = Sz_upper*Sr_guard
-
-        # E-Field
-        # ----------------------------
-        # Define the initial placeholders for the
-        # gathered field for each coordinate
-        Fr = 0.
-        Ft = 0.
-        Fz = 0.
-
-        # Mode 0
-        # ----------------------------
-        # Create temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 0
-        # Lower cell in z, Lower cell in r
-        Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ]
-        Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ]
-        Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ]
-        # Lower cell in z, Upper cell in r
-        Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ]
-        Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ]
-        Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ]
-        # Upper cell in z, Lower cell in r
-        Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ]
-        Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ]
-        Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ]
-        # Upper cell in z, Upper cell in r
-        Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ]
-        Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ]
-        Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ]
-        # Add the fields from the guard cells
-        if ir_lower == ir_upper == 0:
-            # Lower cell in z
-            Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0]
-            Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0]
-            Fz_m +=  1. * S_lg * Ez_m0[ iz_lower, 0]
-            # Upper cell in z
-            Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0]
-            Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0]
-            Fz_m +=  1. * S_ug * Ez_m0[ iz_upper, 0]
-        # Add the fields from the mode 0
-        Fr += (Fr_m*exptheta_m0).real
-        Ft += (Ft_m*exptheta_m0).real
-        Fz += (Fz_m*exptheta_m0).real
-
-        # Mode 1
-        # ----------------------------
-        # Clear the temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 1
-        # Lower cell in z, Lower cell in r
-        Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ]
-        Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ]
-        Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ]
-        # Lower cell in z, Upper cell in r
-        Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ]
-        Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ]
-        Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ]
-        # Upper cell in z, Lower cell in r
-        Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ]
-        Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ]
-        Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ]
-        # Upper cell in z, Upper cell in r
-        Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ]
-        Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ]
-        Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ]
-        # Add the fields from the guard cells
-        if ir_lower == ir_upper == 0:
-            # Lower cell in z
-            Fr_m +=  1. * S_lg * Er_m1[ iz_lower, 0]
-            Ft_m +=  1. * S_lg * Et_m1[ iz_lower, 0]
-            Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0]
-            # Upper cell in z
-            Fr_m +=  1. * S_ug * Er_m1[ iz_upper, 0]
-            Ft_m +=  1. * S_ug * Et_m1[ iz_upper, 0]
-            Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0]
-        # Add the fields from the mode 1
-        Fr += 2*(Fr_m*exptheta_m1).real
-        Ft += 2*(Ft_m*exptheta_m1).real
-        Fz += 2*(Fz_m*exptheta_m1).real
-
-        # Convert to Cartesian coordinates
-        # and write to particle field arrays
-        Ex[i] = cos*Fr - sin*Ft
-        Ey[i] = sin*Fr + cos*Ft
-        Ez[i] = Fz
-
-        # B-Field
-        # ----------------------------
-        # Clear the placeholders for the
-        # gathered field for each coordinate
-        Fr = 0.
-        Ft = 0.
-        Fz = 0.
-
-        # Mode 0
-        # ----------------------------
-        # Create temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 0
-        # Lower cell in z, Lower cell in r
-        Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ]
-        Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ]
-        Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ]
-        # Lower cell in z, Upper cell in r
-        Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ]
-        Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ]
-        Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ]
-        # Upper cell in z, Lower cell in r
-        Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ]
-        Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ]
-        Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ]
-        # Upper cell in z, Upper cell in r
-        Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ]
-        Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ]
-        Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ]
-        # Add the fields from the guard cells
-        if ir_lower == ir_upper == 0:
-            # Lower cell in z
-            Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0]
-            Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0]
-            Fz_m +=  1. * S_lg * Bz_m0[ iz_lower, 0]
-            # Upper cell in z
-            Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0]
-            Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0]
-            Fz_m +=  1. * S_ug * Bz_m0[ iz_upper, 0]
-        # Add the fields from the mode 0
-        Fr += (Fr_m*exptheta_m0).real
-        Ft += (Ft_m*exptheta_m0).real
-        Fz += (Fz_m*exptheta_m0).real
-
-        # Mode 1
-        # ----------------------------
-        # Clear the temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 1
-        # Lower cell in z, Lower cell in r
-        Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ]
-        Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ]
-        Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ]
-        # Lower cell in z, Upper cell in r
-        Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ]
-        Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ]
-        Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ]
-        # Upper cell in z, Lower cell in r
-        Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ]
-        Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ]
-        Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ]
-        # Upper cell in z, Upper cell in r
-        Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ]
-        Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ]
-        Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ]
-
-        # Add the fields from the guard cells
-        if ir_lower == ir_upper == 0:
-            # Lower cell in z
-            Fr_m +=  1. * S_lg * Br_m1[ iz_lower, 0]
-            Ft_m +=  1. * S_lg * Bt_m1[ iz_lower, 0]
-            Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0]
-            # Upper cell in z
-            Fr_m +=  1. * S_ug * Br_m1[ iz_upper, 0]
-            Ft_m +=  1. * S_ug * Bt_m1[ iz_upper, 0]
-            Fz_m += -1. * S_ug * Bz_m1[ iz_upper, 0]
-        # Add the fields from the mode 1
-        Fr += 2*(Fr_m*exptheta_m1).real
-        Ft += 2*(Ft_m*exptheta_m1).real
-        Fz += 2*(Fz_m*exptheta_m1).real
-
-        # Convert to Cartesian coordinates
-        # and write to particle field arrays
-        Bx[i] = cos*Fr - sin*Ft
-        By[i] = sin*Fr + cos*Ft
-        Bz[i] = Fz
-
-    return Ex, Ey, Ez, Bx, By, Bz
-
-# -----------------------
-# Field gathering cubic
-# -----------------------
-
-@numba.njit(parallel=True)
-def gather_field_prange_cubic(x, y, z,
-                    invdz, zmin, Nz,
-                    invdr, rmin, Nr,
-                    Er_m0, Et_m0, Ez_m0,
-                    Er_m1, Et_m1, Ez_m1,
-                    Br_m0, Bt_m0, Bz_m0,
-                    Br_m1, Bt_m1, Bz_m1,
-                    Ex, Ey, Ez,
-                    Bx, By, Bz):
-    """
-    Gathering of the fields (E and B) using numba with multi-threading.
-    Iterates over the particles, calculates the weighted amount
-    of fields acting on each particle based on its shape (cubic).
-    Fields are gathered in cylindrical coordinates and then
-    transformed to cartesian coordinates.
-    Supports only mode 0 and 1.
-
-    Parameters
-    ----------
-    x, y, z : 1darray of floats (in meters)
-        The position of the particles
-
-    invdz, invdr : float (in meters^-1)
-        Inverse of the grid step along the considered direction
-
-    zmin, rmin : float (in meters)
-        Position of the edge of the simulation box along the
-        direction considered
-
-    Nz, Nr : int
-        Number of gridpoints along the considered direction
-
-    Er_m0, Et_m0, Ez_m0 : 2darray of complexs
-        The electric fields on the interpolation grid for the mode 0
-
-    Er_m1, Et_m1, Ez_m1 : 2darray of complexs
-        The electric fields on the interpolation grid for the mode 1
-
-    Br_m0, Bt_m0, Bz_m0 : 2darray of complexs
-        The magnetic fields on the interpolation grid for the mode 0
-
-    Br_m1, Bt_m1, Bz_m1 : 2darray of complexs
-        The magnetic fields on the interpolation grid for the mode 1
-
-    Ex, Ey, Ez : 1darray of floats
-        The electric fields acting on the particles
-        (is modified by this function)
-
-    Bx, By, Bz : 1darray of floats
-        The magnetic fields acting on the particles
-        (is modified by this function)
-    """
-    # Deposit the field per cell in parallel
-    # (for threads < number of particles)
-    for i in prange(x.shape[0]):
-        # Preliminary arrays for the cylindrical conversion
-        # --------------------------------------------
-        # Position
-        xj = x[i]
-        yj = y[i]
-        zj = z[i]
-
-        # Cylindrical conversion
-        rj = math.sqrt(xj**2 + yj**2)
-        if (rj != 0.):
-            invr = 1./rj
-            cos = xj*invr  # Cosine
-            sin = yj*invr  # Sine
-        else:
-            cos = 1.
-            sin = 0.
-        exptheta_m0 = 1.
-        exptheta_m1 = cos - 1.j*sin
-
-        # Get weights for the deposition
-        # --------------------------------------------
-        # Positions of the particle, in the cell unit
-        r_cell = invdr*(rj - rmin) - 0.5
-        z_cell = invdz*(zj - zmin) - 0.5
-
-        # Calculate the shape factors
-        ir = np.zeros(4, dtype=int64) #[0,0,0,0]
-        Sr = np.zeros(4) #[0.,0.,0.,0.]
-        ir[0] = int64(math.floor(r_cell)) - 1
-        ir[1] = ir[0] + 1
-        ir[2] = ir[1] + 1
-        ir[3] = ir[2] + 1
-        Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3
-        Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4)
-        Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4)
-        Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3
-        iz = np.zeros(4, dtype=int64) #[0,0,0,0]
-        Sz = np.zeros(4) #[0.,0.,0.,0.]
-        iz[0] = int64(math.floor(z_cell)) - 1
-        iz[1] = iz[0] + 1
-        iz[2] = iz[1] + 1
-        iz[3] = iz[2] + 1
-        Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3
-        Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4)
-        Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4)
-        Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3
-        # Lower and upper periodic boundary for z
-        index_z = 0
-        while index_z < 4:
-            if iz[index_z] < 0:
-                iz[index_z] += Nz
-            if iz[index_z] > Nz - 1:
-                iz[index_z] -= Nz
-            index_z += 1
-        # Lower and upper boundary for r
-        index_r = 0
-        while index_r < 4:
-            if ir[index_r] < 0:
-                ir[index_r] = abs(ir[index_r])-1
-                Sr[index_r] = (-1.)*Sr[index_r]
-            if ir[index_r] > Nr - 1:
-                ir[index_r] = Nr - 1
-            index_r += 1
-
-        # E-Field
-        # ----------------------------
-        # Define the initial placeholders for the
-        # gathered field for each coordinate
-        Fr = 0.
-        Ft = 0.
-        Fz = 0.
-
-        # Mode 0
-        # ----------------------------
-        # Create temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 0
-        index_r = 0
-        while index_r < 4:
-            index_z = 0
-            while index_z < 4:
-                Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]]
-                Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]]
-                if Sz[index_z]*Sr[index_r] < 0:
-                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                        Ez_m0[iz[index_z], ir[index_r]]
-                else:
-                    Fz_m += Sz[index_z]*Sr[index_r]* \
-                        Ez_m0[iz[index_z], ir[index_r]]
-                index_z += 1
-            index_r += 1
-
-        Fr += (Fr_m*exptheta_m0).real
-        Ft += (Ft_m*exptheta_m0).real
-        Fz += (Fz_m*exptheta_m0).real
-
-        # Mode 1
-        # ----------------------------
-        # Clear the temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 1
-        index_r = 0
-        while index_r < 4:
-            index_z = 0
-            while index_z < 4:
-                if Sz[index_z]*Sr[index_r] < 0:
-                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                                Er_m1[iz[index_z], ir[index_r]]
-                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                                Et_m1[iz[index_z], ir[index_r]]
-                else:
-                    Fr_m += Sz[index_z]*Sr[index_r]* \
-                                Er_m1[iz[index_z], ir[index_r]]
-                    Ft_m += Sz[index_z]*Sr[index_r]* \
-                                Et_m1[iz[index_z], ir[index_r]]
-                Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]]
-                index_z += 1
-            index_r += 1
-
-        # Add the fields from the mode 1
-        Fr += 2*(Fr_m*exptheta_m1).real
-        Ft += 2*(Ft_m*exptheta_m1).real
-        Fz += 2*(Fz_m*exptheta_m1).real
-
-        # Convert to Cartesian coordinates
-        # and write to particle field arrays
-        Ex[i] = (cos*Fr - sin*Ft)
-        Ey[i] = (sin*Fr + cos*Ft)
-        Ez[i] = Fz
-
-        # B-Field
-        # ----------------------------
-        # Clear the placeholders for the
-        # gathered field for each coordinate
-        Fr = 0.
-        Ft = 0.
-        Fz = 0.
-
-        # Mode 0
-        # ----------------------------
-        # Create temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-        # Add the fields for mode 0
-        index_r = 0
-        while index_r < 4:
-            index_z = 0
-            while index_z < 4:
-                Fr_m += Sz[index_z]*Sr[index_r]* \
-                    Br_m0[iz[index_z], ir[index_r]]
-                Ft_m += Sz[index_z]*Sr[index_r]* \
-                    Bt_m0[iz[index_z], ir[index_r]]
-                if Sz[index_z]*Sr[index_r] < 0:
-                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                        Bz_m0[iz[index_z], ir[index_r]]
-                else:
-                    Fz_m += Sz[index_z]*Sr[index_r]* \
-                        Bz_m0[iz[index_z], ir[index_r]]
-                index_z += 1
-            index_r += 1
-
-        # Add the fields from the mode 0
-        Fr += (Fr_m*exptheta_m0).real
-        Ft += (Ft_m*exptheta_m0).real
-        Fz += (Fz_m*exptheta_m0).real
-
-        # Mode 1
-        # ----------------------------
-        # Clear the temporary variables
-        # for the "per mode" gathering
-        Fr_m = 0.j
-        Ft_m = 0.j
-        Fz_m = 0.j
-
-        # Add the fields for mode 1
-        index_r = 0
-        while index_r < 4:
-            index_z = 0
-            while index_z < 4:
-                if Sz[index_z]*Sr[index_r] < 0:
-                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                        Br_m1[iz[index_z], ir[index_r]]
-                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
-                        Bt_m1[iz[index_z], ir[index_r]]
-                else:
-                    Fr_m += Sz[index_z]*Sr[index_r]* \
-                        Br_m1[iz[index_z], ir[index_r]]
-                    Ft_m += Sz[index_z]*Sr[index_r]* \
-                        Bt_m1[iz[index_z], ir[index_r]]
-                Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]]
-                index_z += 1
-            index_r += 1
-
-        # Add the fields from the mode 1
-        Fr += 2*(Fr_m*exptheta_m1).real
-        Ft += 2*(Ft_m*exptheta_m1).real
-        Fz += 2*(Fz_m*exptheta_m1).real
-
-        # Convert to Cartesian coordinates
-        # and write to particle field arrays
-        Bx[i] = cos*Fr - sin*Ft
-        By[i] = sin*Fr + cos*Ft
-        Bz[i] = Fz
-
-    return Ex, Ey, Ez, Bx, By, Bz
\ No newline at end of file
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen, Kevin Peters
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines the field gathering methods for linear and cubic order shapes
+on the CPU with threading
+"""
+from numba import prange, int64
+import numba
+import math
+import numpy as np
+
+# -----------------------
+# Field gathering linear
+# -----------------------
+
+@numba.njit(parallel=True)
+def gather_field_prange_linear(x, y, z,
+                    invdz, zmin, Nz,
+                    invdr, rmin, Nr,
+                    Er_m0, Et_m0, Ez_m0,
+                    Er_m1, Et_m1, Ez_m1,
+                    Br_m0, Bt_m0, Bz_m0,
+                    Br_m1, Bt_m1, Bz_m1,
+                    Ex, Ey, Ez,
+                    Bx, By, Bz):
+    """
+    Gathering of the fields (E and B) using numba with multi-threading.
+    Iterates over the particles, calculates the weighted amount
+    of fields acting on each particle based on its shape (linear).
+    Fields are gathered in cylindrical coordinates and then
+    transformed to cartesian coordinates.
+    Supports only mode 0 and 1.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box along the
+        direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    Er_m0, Et_m0, Ez_m0 : 2darray of complexs
+        The electric fields on the interpolation grid for the mode 0
+
+    Er_m1, Et_m1, Ez_m1 : 2darray of complexs
+        The electric fields on the interpolation grid for the mode 1
+
+    Br_m0, Bt_m0, Bz_m0 : 2darray of complexs
+        The magnetic fields on the interpolation grid for the mode 0
+
+    Br_m1, Bt_m1, Bz_m1 : 2darray of complexs
+        The magnetic fields on the interpolation grid for the mode 1
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+        (is modified by this function)
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+        (is modified by this function)
+    """
+    # Gather the fields for each particle in parallel
+    for i in prange(x.shape[0]):
+        # Preliminary arrays for the cylindrical conversion
+        # --------------------------------------------
+        # Position
+        xj = x[i]
+        yj = y[i]
+        zj = z[i]
+
+        # Cylindrical conversion
+        rj = math.sqrt( xj**2 + yj**2 )
+        if (rj !=0. ) :
+            invr = 1./rj
+            cos = xj*invr  # Cosine
+            sin = yj*invr  # Sine
+        else :
+            cos = 1.
+            sin = 0.
+        exptheta_m0 = 1.
+        exptheta_m1 = cos - 1.j*sin
+
+        # Get linear weights for the deposition
+        # --------------------------------------------
+        # Positions of the particles, in the cell unit
+        r_cell =  invdr*(rj - rmin) - 0.5
+        z_cell =  invdz*(zj - zmin) - 0.5
+        # Original index of the upper and lower cell
+        ir_lower = int(math.floor( r_cell ))
+        ir_upper = ir_lower + 1
+        iz_lower = int(math.floor( z_cell ))
+        iz_upper = iz_lower + 1
+        # Linear weight
+        Sr_lower = ir_upper - r_cell
+        Sr_upper = r_cell - ir_lower
+        Sz_lower = iz_upper - z_cell
+        Sz_upper = z_cell - iz_lower
+        # Set guard weights to zero
+        Sr_guard = 0.
+
+        # Treat the boundary conditions
+        # --------------------------------------------
+        # guard cells in lower r
+        if ir_lower < 0:
+            Sr_guard = Sr_lower
+            Sr_lower = 0.
+            ir_lower = 0
+        # absorbing in upper r
+        if ir_lower > Nr-1:
+            ir_lower = Nr-1
+        if ir_upper > Nr-1:
+            ir_upper = Nr-1
+        # periodic boundaries in z
+        # lower z boundaries
+        if iz_lower < 0:
+            iz_lower += Nz
+        if iz_upper < 0:
+            iz_upper += Nz
+        # upper z boundaries
+        if iz_lower > Nz-1:
+            iz_lower -= Nz
+        if iz_upper > Nz-1:
+            iz_upper -= Nz
+
+        #Precalculate Shapes
+        S_ll = Sz_lower*Sr_lower
+        S_lu = Sz_lower*Sr_upper
+        S_ul = Sz_upper*Sr_lower
+        S_uu = Sz_upper*Sr_upper
+        S_lg = Sz_lower*Sr_guard
+        S_ug = Sz_upper*Sr_guard
+
+        # E-Field
+        # ----------------------------
+        # Define the initial placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Er_m0[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Et_m0[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Ez_m0[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Er_m0[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Et_m0[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Ez_m0[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Er_m0[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Et_m0[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Ez_m0[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Er_m0[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Et_m0[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Ez_m0[ iz_upper, ir_upper ]
+        # Add the fields from the guard cells
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m += -1. * S_lg * Er_m0[ iz_lower, 0]
+            Ft_m += -1. * S_lg * Et_m0[ iz_lower, 0]
+            Fz_m +=  1. * S_lg * Ez_m0[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m += -1. * S_ug * Er_m0[ iz_upper, 0]
+            Ft_m += -1. * S_ug * Et_m0[ iz_upper, 0]
+            Fz_m +=  1. * S_ug * Ez_m0[ iz_upper, 0]
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Er_m1[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Et_m1[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Ez_m1[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Er_m1[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Et_m1[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Ez_m1[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Er_m1[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Et_m1[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Ez_m1[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Er_m1[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Et_m1[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Ez_m1[ iz_upper, ir_upper ]
+        # Add the fields from the guard cells
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m +=  1. * S_lg * Er_m1[ iz_lower, 0]
+            Ft_m +=  1. * S_lg * Et_m1[ iz_lower, 0]
+            Fz_m += -1. * S_lg * Ez_m1[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m +=  1. * S_ug * Er_m1[ iz_upper, 0]
+            Ft_m +=  1. * S_ug * Et_m1[ iz_upper, 0]
+            Fz_m += -1. * S_ug * Ez_m1[ iz_upper, 0]
+        # Add the fields from the mode 1
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Ex[i] = cos*Fr - sin*Ft
+        Ey[i] = sin*Fr + cos*Ft
+        Ez[i] = Fz
+
+        # B-Field
+        # ----------------------------
+        # Clear the placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Br_m0[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Bt_m0[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Bz_m0[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Br_m0[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Bt_m0[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Bz_m0[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Br_m0[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Bt_m0[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Bz_m0[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Br_m0[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Bt_m0[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Bz_m0[ iz_upper, ir_upper ]
+        # Add the fields from the guard cells
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m += -1. * S_lg * Br_m0[ iz_lower, 0]
+            Ft_m += -1. * S_lg * Bt_m0[ iz_lower, 0]
+            Fz_m +=  1. * S_lg * Bz_m0[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m += -1. * S_ug * Br_m0[ iz_upper, 0]
+            Ft_m += -1. * S_ug * Bt_m0[ iz_upper, 0]
+            Fz_m +=  1. * S_ug * Bz_m0[ iz_upper, 0]
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1
+        # Lower cell in z, Lower cell in r
+        Fr_m += S_ll * Br_m1[ iz_lower, ir_lower ]
+        Ft_m += S_ll * Bt_m1[ iz_lower, ir_lower ]
+        Fz_m += S_ll * Bz_m1[ iz_lower, ir_lower ]
+        # Lower cell in z, Upper cell in r
+        Fr_m += S_lu * Br_m1[ iz_lower, ir_upper ]
+        Ft_m += S_lu * Bt_m1[ iz_lower, ir_upper ]
+        Fz_m += S_lu * Bz_m1[ iz_lower, ir_upper ]
+        # Upper cell in z, Lower cell in r
+        Fr_m += S_ul * Br_m1[ iz_upper, ir_lower ]
+        Ft_m += S_ul * Bt_m1[ iz_upper, ir_lower ]
+        Fz_m += S_ul * Bz_m1[ iz_upper, ir_lower ]
+        # Upper cell in z, Upper cell in r
+        Fr_m += S_uu * Br_m1[ iz_upper, ir_upper ]
+        Ft_m += S_uu * Bt_m1[ iz_upper, ir_upper ]
+        Fz_m += S_uu * Bz_m1[ iz_upper, ir_upper ]
+
+        # Add the fields from the guard cells
+        if ir_lower == ir_upper == 0:
+            # Lower cell in z
+            Fr_m +=  1. * S_lg * Br_m1[ iz_lower, 0]
+            Ft_m +=  1. * S_lg * Bt_m1[ iz_lower, 0]
+            Fz_m += -1. * S_lg * Bz_m1[ iz_lower, 0]
+            # Upper cell in z
+            Fr_m +=  1. * S_ug * Br_m1[ iz_upper, 0]
+            Ft_m +=  1. * S_ug * Bt_m1[ iz_upper, 0]
+            Fz_m += -1. * S_ug * Bz_m1[ iz_upper, 0]
+        # Add the fields from the mode 1
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Bx[i] = cos*Fr - sin*Ft
+        By[i] = sin*Fr + cos*Ft
+        Bz[i] = Fz
+
+    return Ex, Ey, Ez, Bx, By, Bz
+
+# -----------------------
+# Field gathering cubic
+# -----------------------
+
+@numba.njit(parallel=True)
+def gather_field_prange_cubic(x, y, z,
+                    invdz, zmin, Nz,
+                    invdr, rmin, Nr,
+                    Er_m0, Et_m0, Ez_m0,
+                    Er_m1, Et_m1, Ez_m1,
+                    Br_m0, Bt_m0, Bz_m0,
+                    Br_m1, Bt_m1, Bz_m1,
+                    Ex, Ey, Ez,
+                    Bx, By, Bz):
+    """
+    Gathering of the fields (E and B) using numba with multi-threading.
+    Iterates over the particles, calculates the weighted amount of
+    fields acting on each particle from 4x4 grid points (cubic shape).
+    Fields are gathered in cylindrical coordinates and then
+    transformed to cartesian coordinates.
+    Supports only mode 0 and 1.
+
+    Parameters
+    ----------
+    x, y, z : 1darray of floats (in meters)
+        The position of the particles
+
+    invdz, invdr : float (in meters^-1)
+        Inverse of the grid step along the considered direction
+
+    zmin, rmin : float (in meters)
+        Position of the edge of the simulation box along the
+        direction considered
+
+    Nz, Nr : int
+        Number of gridpoints along the considered direction
+
+    Er_m0, Et_m0, Ez_m0 : 2darray of complex numbers
+        The electric fields on the interpolation grid for the mode 0
+
+    Er_m1, Et_m1, Ez_m1 : 2darray of complex numbers
+        The electric fields on the interpolation grid for the mode 1
+
+    Br_m0, Bt_m0, Bz_m0 : 2darray of complex numbers
+        The magnetic fields on the interpolation grid for the mode 0
+
+    Br_m1, Bt_m1, Bz_m1 : 2darray of complex numbers
+        The magnetic fields on the interpolation grid for the mode 1
+
+    Ex, Ey, Ez : 1darray of floats
+        The electric fields acting on the particles
+        (modified in place by this function, and also returned)
+
+    Bx, By, Bz : 1darray of floats
+        The magnetic fields acting on the particles
+        (modified in place by this function, and also returned)
+    """
+    # Gather the fields for each particle; the iterations of this
+    # prange loop are distributed over the threads by numba
+    for i in prange(x.shape[0]):
+        # Preliminary quantities for the cylindrical conversion
+        # --------------------------------------------
+        # Position
+        xj = x[i]
+        yj = y[i]
+        zj = z[i]
+
+        # Cylindrical conversion
+        rj = math.sqrt(xj**2 + yj**2)
+        if (rj != 0.):
+            invr = 1./rj
+            cos = xj*invr  # Cosine
+            sin = yj*invr  # Sine
+        else:
+            cos = 1.
+            sin = 0.
+        exptheta_m0 = 1.
+        exptheta_m1 = cos - 1.j*sin  # exp(-i*theta)
+
+        # Get weights for the gathering
+        # --------------------------------------------
+        # Positions of the particle, in cell units (with a half-cell shift)
+        r_cell = invdr*(rj - rmin) - 0.5
+        z_cell = invdz*(zj - zmin) - 0.5
+
+        # Calculate the indices and shape factors of the 4 points per direction
+        ir = np.zeros(4, dtype=int64) # radial indices
+        Sr = np.zeros(4) # radial weights
+        ir[0] = int64(math.floor(r_cell)) - 1
+        ir[1] = ir[0] + 1
+        ir[2] = ir[1] + 1
+        ir[3] = ir[2] + 1
+        Sr[0] = -1./6. * ((r_cell-ir[0])-2)**3
+        Sr[1] = 1./6. * (3*((r_cell-ir[1])**3)-6*((r_cell-ir[1])**2)+4)
+        Sr[2] = 1./6. * (3*((ir[2]-r_cell)**3)-6*((ir[2]-r_cell)**2)+4)
+        Sr[3] = -1./6. * ((ir[3]-r_cell)-2)**3
+        iz = np.zeros(4, dtype=int64) # longitudinal indices
+        Sz = np.zeros(4) # longitudinal weights
+        iz[0] = int64(math.floor(z_cell)) - 1
+        iz[1] = iz[0] + 1
+        iz[2] = iz[1] + 1
+        iz[3] = iz[2] + 1
+        Sz[0] = -1./6. * ((z_cell-iz[0])-2)**3
+        Sz[1] = 1./6. * (3*((z_cell-iz[1])**3)-6*((z_cell-iz[1])**2)+4)
+        Sz[2] = 1./6. * (3*((iz[2]-z_cell)**3)-6*((iz[2]-z_cell)**2)+4)
+        Sz[3] = -1./6. * ((iz[3]-z_cell)-2)**3
+        # Periodic boundary for z: shift out-of-range indices by Nz
+        index_z = 0
+        while index_z < 4:
+            if iz[index_z] < 0:
+                iz[index_z] += Nz
+            if iz[index_z] > Nz - 1:
+                iz[index_z] -= Nz
+            index_z += 1
+        # Boundaries for r: reflect indices < 0 (negating Sr), clamp at Nr-1
+        index_r = 0
+        while index_r < 4:
+            if ir[index_r] < 0:
+                ir[index_r] = abs(ir[index_r])-1
+                Sr[index_r] = (-1.)*Sr[index_r]
+            if ir[index_r] > Nr - 1:
+                ir[index_r] = Nr - 1
+            index_r += 1
+
+        # E-Field
+        # ----------------------------
+        # Define the initial placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0 (negated weight: sign undone for Ez only)
+        index_r = 0
+        while index_r < 4:
+            index_z = 0
+            while index_z < 4:
+                Fr_m += Sz[index_z]*Sr[index_r]*Er_m0[iz[index_z], ir[index_r]]
+                Ft_m += Sz[index_z]*Sr[index_r]*Et_m0[iz[index_z], ir[index_r]]
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Ez_m0[iz[index_z], ir[index_r]]
+                else:
+                    Fz_m += Sz[index_z]*Sr[index_r]* \
+                        Ez_m0[iz[index_z], ir[index_r]]
+                index_z += 1
+            index_r += 1
+
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 1 (negated weight: sign undone for Er, Et)
+        index_r = 0
+        while index_r < 4:
+            index_z = 0
+            while index_z < 4:
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                                Er_m1[iz[index_z], ir[index_r]]
+                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                                Et_m1[iz[index_z], ir[index_r]]
+                else:
+                    Fr_m += Sz[index_z]*Sr[index_r]* \
+                                Er_m1[iz[index_z], ir[index_r]]
+                    Ft_m += Sz[index_z]*Sr[index_r]* \
+                                Et_m1[iz[index_z], ir[index_r]]
+                Fz_m += Sz[index_z]*Sr[index_r]*Ez_m1[iz[index_z], ir[index_r]]
+                index_z += 1
+            index_r += 1
+
+        # Add the fields from the mode 1
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Ex[i] = (cos*Fr - sin*Ft)
+        Ey[i] = (sin*Fr + cos*Ft)
+        Ez[i] = Fz
+
+        # B-Field
+        # ----------------------------
+        # Clear the placeholders for the
+        # gathered field for each coordinate
+        Fr = 0.
+        Ft = 0.
+        Fz = 0.
+
+        # Mode 0
+        # ----------------------------
+        # Create temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+        # Add the fields for mode 0 (negated weight: sign undone for Bz only)
+        index_r = 0
+        while index_r < 4:
+            index_z = 0
+            while index_z < 4:
+                Fr_m += Sz[index_z]*Sr[index_r]* \
+                    Br_m0[iz[index_z], ir[index_r]]
+                Ft_m += Sz[index_z]*Sr[index_r]* \
+                    Bt_m0[iz[index_z], ir[index_r]]
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fz_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Bz_m0[iz[index_z], ir[index_r]]
+                else:
+                    Fz_m += Sz[index_z]*Sr[index_r]* \
+                        Bz_m0[iz[index_z], ir[index_r]]
+                index_z += 1
+            index_r += 1
+
+        # Add the fields from the mode 0
+        Fr += (Fr_m*exptheta_m0).real
+        Ft += (Ft_m*exptheta_m0).real
+        Fz += (Fz_m*exptheta_m0).real
+
+        # Mode 1
+        # ----------------------------
+        # Clear the temporary variables
+        # for the "per mode" gathering
+        Fr_m = 0.j
+        Ft_m = 0.j
+        Fz_m = 0.j
+
+        # Add the fields for mode 1 (negated weight: sign undone for Br, Bt)
+        index_r = 0
+        while index_r < 4:
+            index_z = 0
+            while index_z < 4:
+                if Sz[index_z]*Sr[index_r] < 0:
+                    Fr_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Br_m1[iz[index_z], ir[index_r]]
+                    Ft_m += (-1.)*Sz[index_z]*Sr[index_r]* \
+                        Bt_m1[iz[index_z], ir[index_r]]
+                else:
+                    Fr_m += Sz[index_z]*Sr[index_r]* \
+                        Br_m1[iz[index_z], ir[index_r]]
+                    Ft_m += Sz[index_z]*Sr[index_r]* \
+                        Bt_m1[iz[index_z], ir[index_r]]
+                Fz_m += Sz[index_z]*Sr[index_r]*Bz_m1[iz[index_z], ir[index_r]]
+                index_z += 1
+            index_r += 1
+
+        # Add the fields from the mode 1
+        Fr += 2*(Fr_m*exptheta_m1).real
+        Ft += 2*(Ft_m*exptheta_m1).real
+        Fz += 2*(Fz_m*exptheta_m1).real
+
+        # Convert to Cartesian coordinates
+        # and write to particle field arrays
+        Bx[i] = cos*Fr - sin*Ft
+        By[i] = sin*Fr + cos*Ft
+        Bz[i] = Fz
+
+    return Ex, Ey, Ez, Bx, By, Bz

From afd5d3b3aad1b04d9c0fceffd239926678662d74 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Fri, 21 Jul 2017 21:31:33 -0700
Subject: [PATCH 34/36] Replace tx_chunks by an array

---
 .../particles/deposition/threading_methods.py | 16 ++++----------
 fbpic/particles/particles.py                  | 21 ++++++++++++-------
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 3aa895087..2c85e8236 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -125,9 +125,7 @@ def deposit_rho_prange_linear(x, y, z, w,
     # Deposit the field per cell in parallel (for threads < number of cells)
     for tx in prange( nthreads ):
         # Loop over all particles in thread chunk
-        for idx in range( tx_chunks[tx] ):
-            # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_chunks[0]
+        for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ):
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
@@ -296,9 +294,7 @@ def deposit_J_prange_linear(x, y, z, w,
     # Deposit the field per cell in parallel (for threads < number of cells)
     for tx in prange( nthreads ):
         # Loop over all particles in thread chunk
-        for idx in range( tx_chunks[tx] ):
-            # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_chunks[0]
+        for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ):
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
@@ -539,9 +535,7 @@ def deposit_rho_prange_cubic(x, y, z, w,
     # Deposit the field per cell in parallel (for threads < number of cells)
     for tx in prange( nthreads ):
         # Loop over all particles in thread chunk
-        for idx in range( tx_chunks[tx] ):
-            # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_chunks[0]
+        for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ):
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
@@ -880,9 +874,7 @@ def deposit_J_prange_cubic(x, y, z, w,
     # Deposit the field per cell in parallel (for threads < number of cells)
     for tx in prange( nthreads ):
         # Loop over all particles in thread chunk
-        for idx in range( tx_chunks[tx] ):
-            # Calculate thread local particle index
-            ptcl_idx = idx + tx*tx_chunks[0]
+        for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ):
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
diff --git a/fbpic/particles/particles.py b/fbpic/particles/particles.py
index 8474d2dc2..63e4a871c 100644
--- a/fbpic/particles/particles.py
+++ b/fbpic/particles/particles.py
@@ -743,10 +743,15 @@ def deposit( self, fld, fieldtype ) :
                                   'rho', but is `%s`" % fieldtype)
         # CPU multi-threading version
         elif self.use_threading:
-            # Register particle chunk size for each thread
-            tx_N = int(self.Ntot/self.nthreads)
-            tx_chunks = [ tx_N for k in range(self.nthreads) ]
-            tx_chunks[-1] = tx_chunks[-1] + int(self.Ntot%self.nthreads)
+
+            # Divide particles into chunks (each chunk is handled by a
+            # different thread) and register the indices that bound each chunk
+            n_avg_per_thread = int( self.Ntot/self.nthreads )
+            # Attribute n_avg_per_thread to each thread (except the last one)
+            ptcl_chunk_indices = np.array(
+                [ i_chk*n_avg_per_thread for i_chk in range(self.nthreads+1) ],
+                dtype=np.uint64 )
+            ptcl_chunk_indices[-1] = self.Ntot
             # Multithreading functions for the deposition of rho or J
             # for Mode 0 and 1 only.
             if fieldtype == 'rho':
@@ -764,14 +769,14 @@ def deposit( self, fld, fieldtype ) :
                         grid[0].invdz, grid[0].zmin, grid[0].Nz,
                         grid[0].invdr, grid[0].rmin, grid[0].Nr,
                         rho_m0_global, rho_m1_global,
-                        self.nthreads, tx_chunks )
+                        self.nthreads, ptcl_chunk_indices )
                 elif self.particle_shape == 'cubic':
                     deposit_rho_prange_cubic(
                         self.x, self.y, self.z, self.w,
                         grid[0].invdz, grid[0].zmin, grid[0].Nz,
                         grid[0].invdr, grid[0].rmin, grid[0].Nr,
                         rho_m0_global, rho_m1_global,
-                        self.nthreads, tx_chunks )
+                        self.nthreads, ptcl_chunk_indices )
                 else:
                     raise ValueError("`particle_shape` should be either \
                                       'linear' or 'cubic' \
@@ -810,7 +815,7 @@ def deposit( self, fld, fieldtype ) :
                         Jr_m0_global, Jr_m1_global,
                         Jt_m0_global, Jt_m1_global,
                         Jz_m0_global, Jz_m1_global,
-                        self.nthreads, tx_chunks )
+                        self.nthreads, ptcl_chunk_indices )
                 elif self.particle_shape == 'cubic':
                     deposit_J_prange_cubic(
                         self.x, self.y, self.z, self.w,
@@ -820,7 +825,7 @@ def deposit( self, fld, fieldtype ) :
                         Jr_m0_global, Jr_m1_global,
                         Jt_m0_global, Jt_m1_global,
                         Jz_m0_global, Jz_m1_global,
-                        self.nthreads, tx_chunks )
+                        self.nthreads, ptcl_chunk_indices )
                 else:
                     raise ValueError("`particle_shape` should be either \
                                       'linear' or 'cubic' \

From d35a7c1eed3d068f30e07a0e0c7859bc62cb5472 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Fri, 21 Jul 2017 21:40:23 -0700
Subject: [PATCH 35/36] Changes in variable names and docstring

---
 .../particles/deposition/threading_methods.py | 433 +++++++++---------
 1 file changed, 214 insertions(+), 219 deletions(-)

diff --git a/fbpic/particles/deposition/threading_methods.py b/fbpic/particles/deposition/threading_methods.py
index 2c85e8236..4f38862a6 100644
--- a/fbpic/particles/deposition/threading_methods.py
+++ b/fbpic/particles/deposition/threading_methods.py
@@ -78,7 +78,7 @@ def deposit_rho_prange_linear(x, y, z, w,
                            invdz, zmin, Nz,
                            invdr, rmin, Nr,
                            rho_m0_global, rho_m1_global,
-                           nthreads, tx_chunks):
+                           nthreads, ptcl_chunk_indices):
     """
     Deposition of the charge density rho using numba prange on the CPU.
     Iterates over the threads in parallel, while each thread iterates
@@ -90,8 +90,6 @@ def deposit_rho_prange_linear(x, y, z, w,
     Calculates the weighted amount of rho that is deposited to the
     4 cells surounding the particle based on its shape (linear).
 
-    The rest of the execution is similar to the CUDA equivalent function.
-
     Parameters
     ----------
     x, y, z : 1darray of floats (in meters)
@@ -118,22 +116,23 @@ def deposit_rho_prange_linear(x, y, z, w,
     nthreads : int
         Number of CPU threads used with numba prange
 
-    tx_chunks : list of int
-        Exact particle batch size per thread.
-        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    ptcl_chunk_indices : array of int, of size nthreads+1
+        The indices (of the particle array) between which each thread
+        should loop. (i.e. divisions of particle array between threads)
     """
     # Deposit the field per cell in parallel (for threads < number of cells)
-    for tx in prange( nthreads ):
+    for i_thread in prange( nthreads ):
         # Loop over all particles in thread chunk
-        for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ):
+        for i_ptcl in range( ptcl_chunk_indices[i_thread],
+                             ptcl_chunk_indices[i_thread+1] ):
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
+            xj = x[i_ptcl]
+            yj = y[i_ptcl]
+            zj = z[i_ptcl]
             # Weights
-            wj = w[ptcl_idx]
+            wj = w[i_ptcl]
 
             # Cylindrical conversion
             rj = math.sqrt(xj**2 + yj**2)
@@ -215,17 +214,17 @@ def deposit_rho_prange_linear(x, y, z, w,
                 shift_z -= Nz
 
             # Write ptcl fields to thread-local part of global deposition array
-            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_00
-            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_00
+            rho_m0_global[i_thread, iz_cell, ir_cell] += R_m0_00
+            rho_m1_global[i_thread, iz_cell, ir_cell] += R_m1_00
 
-            rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m0_01
-            rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += R_m1_01
+            rho_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += R_m0_01
+            rho_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += R_m1_01
 
-            rho_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m0_10
-            rho_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += R_m1_10
+            rho_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += R_m0_10
+            rho_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += R_m1_10
 
-            rho_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
-            rho_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11
+            rho_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m0_11
+            rho_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += R_m1_11
 
     return
 
@@ -241,7 +240,7 @@ def deposit_J_prange_linear(x, y, z, w,
                          j_r_m0_global, j_r_m1_global,
                          j_t_m0_global, j_t_m1_global,
                          j_z_m0_global, j_z_m1_global,
-                         nthreads, tx_chunks):
+                         nthreads, ptcl_chunk_indices):
     """
     Deposition of the current density J using numba prange on the CPU.
     Iterates over the threads in parallel, while each thread iterates
@@ -253,8 +252,6 @@ def deposit_J_prange_linear(x, y, z, w,
     Calculates the weighted amount of J that is deposited to the
     4 cells surounding the particle based on its shape (linear).
 
-    The rest of the execution is similar to the CUDA equivalent function.
-
     Parameters
     ----------
     x, y, z : 1darray of floats (in meters)
@@ -287,28 +284,29 @@ def deposit_J_prange_linear(x, y, z, w,
     nthreads : int
         Number of CPU threads used with numba prange
 
-    tx_chunks : list of int
-        Exact particle batch size per thread.
-        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    ptcl_chunk_indices : array of int, of size nthreads+1
+        The indices (of the particle array) between which each thread
+        should loop. (i.e. divisions of particle array between threads)
     """
     # Deposit the field per cell in parallel (for threads < number of cells)
-    for tx in prange( nthreads ):
+    for i_thread in prange( nthreads ):
         # Loop over all particles in thread chunk
-        for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ):
+        for i_ptcl in range( ptcl_chunk_indices[i_thread],
+                             ptcl_chunk_indices[i_thread+1] ):
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
+            xj = x[i_ptcl]
+            yj = y[i_ptcl]
+            zj = z[i_ptcl]
             # Velocity
-            uxj = ux[ptcl_idx]
-            uyj = uy[ptcl_idx]
-            uzj = uz[ptcl_idx]
+            uxj = ux[i_ptcl]
+            uyj = uy[i_ptcl]
+            uzj = uz[i_ptcl]
             # Inverse gamma
-            inv_gammaj = inv_gamma[ptcl_idx]
+            inv_gammaj = inv_gamma[i_ptcl]
             # Weights
-            wj = w[ptcl_idx]
+            wj = w[i_ptcl]
 
             # Cylindrical conversion
             rj = math.sqrt(xj**2 + yj**2)
@@ -439,41 +437,41 @@ def deposit_J_prange_linear(x, y, z, w,
                 shift_z -= Nz
 
             # Write ptcl fields to thread-local part of global deposition array
-            j_r_m0_global[tx,iz_cell, ir_cell] += J_r_m0_00
-            j_r_m1_global[tx,iz_cell, ir_cell] += J_r_m1_00
+            j_r_m0_global[i_thread,iz_cell, ir_cell] += J_r_m0_00
+            j_r_m1_global[i_thread,iz_cell, ir_cell] += J_r_m1_00
 
-            j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01
-            j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01
+            j_r_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_r_m0_01
+            j_r_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_r_m1_01
 
-            j_r_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10
-            j_r_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10
+            j_r_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_r_m0_10
+            j_r_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_r_m1_10
 
-            j_r_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11
-            j_r_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11
+            j_r_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m0_11
+            j_r_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_r_m1_11
 
-            j_t_m0_global[tx,iz_cell, ir_cell] += J_t_m0_00
-            j_t_m1_global[tx,iz_cell, ir_cell] += J_t_m1_00
+            j_t_m0_global[i_thread,iz_cell, ir_cell] += J_t_m0_00
+            j_t_m1_global[i_thread,iz_cell, ir_cell] += J_t_m1_00
 
-            j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01
-            j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01
+            j_t_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_t_m0_01
+            j_t_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_t_m1_01
 
-            j_t_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10
-            j_t_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10
+            j_t_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_t_m0_10
+            j_t_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_t_m1_10
 
-            j_t_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11
-            j_t_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11
+            j_t_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m0_11
+            j_t_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_t_m1_11
 
-            j_z_m0_global[tx,iz_cell, ir_cell] += J_z_m0_00
-            j_z_m1_global[tx,iz_cell, ir_cell] += J_z_m1_00
+            j_z_m0_global[i_thread,iz_cell, ir_cell] += J_z_m0_00
+            j_z_m1_global[i_thread,iz_cell, ir_cell] += J_z_m1_00
 
-            j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01
-            j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01
+            j_z_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_z_m0_01
+            j_z_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell] += J_z_m1_01
 
-            j_z_m0_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10
-            j_z_m1_global[tx,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10
+            j_z_m0_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_z_m0_10
+            j_z_m1_global[i_thread,iz_cell, ir_cell+1 + shift_r] += J_z_m1_10
 
-            j_z_m0_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11
-            j_z_m1_global[tx,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11
+            j_z_m0_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m0_11
+            j_z_m1_global[i_thread,iz_cell+1 + shift_z, ir_cell+1 + shift_r] += J_z_m1_11
 
     return
 
@@ -487,9 +485,8 @@ def deposit_rho_prange_cubic(x, y, z, w,
                           invdz, zmin, Nz,
                           invdr, rmin, Nr,
                           rho_m0_global, rho_m1_global,
-                          nthreads, tx_chunks):
+                          nthreads, ptcl_chunk_indices):
     """
-
     Deposition of the charge density rho using numba prange on the CPU.
     Iterates over the threads in parallel, while each thread iterates
     over a batch of particles. Intermediate results for each threads are
@@ -500,8 +497,6 @@ def deposit_rho_prange_cubic(x, y, z, w,
     Calculates the weighted amount of rho that is deposited to the
     16 cells surounding the particle based on its shape (cubic).
 
-    The rest of the execution is similar to the CUDA equivalent function.
-
     Parameters
     ----------
     x, y, z : 1darray of floats (in meters)
@@ -528,22 +523,23 @@ def deposit_rho_prange_cubic(x, y, z, w,
     nthreads : int
         Number of CPU threads used with numba prange
 
-    tx_chunks : list of int
-        Exact particle batch size per thread.
-        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    ptcl_chunk_indices : array of int, of size nthreads+1
+        The indices (of the particle array) between which each thread
+        should loop. (i.e. divisions of particle array between threads)
     """
     # Deposit the field per cell in parallel (for threads < number of cells)
-    for tx in prange( nthreads ):
+    for i_thread in prange( nthreads ):
         # Loop over all particles in thread chunk
-        for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ):
+        for i_ptcl in range( ptcl_chunk_indices[i_thread],
+                             ptcl_chunk_indices[i_thread+1] ):
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
+            xj = x[i_ptcl]
+            yj = y[i_ptcl]
+            zj = z[i_ptcl]
             # Weights
-            wj = w[ptcl_idx]
+            wj = w[i_ptcl]
 
             # Cylindrical conversion
             rj = math.sqrt(xj**2 + yj**2)
@@ -774,38 +770,38 @@ def deposit_rho_prange_cubic(x, y, z, w,
                 srl = 1
 
             # Write ptcl fields to thread-local part of global deposition array
-            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00
-            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00
-            rho_m0_global[tx, iz_cell, ir_cell - 1 + srl] += R_m0_01
-            rho_m1_global[tx, iz_cell, ir_cell - 1 + srl] += R_m1_01
-            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02
-            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02
-            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03
-            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03
-            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell] += R_m0_10
-            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell] += R_m1_10
-            rho_m0_global[tx, iz_cell, ir_cell] += R_m0_11
-            rho_m1_global[tx, iz_cell, ir_cell] += R_m1_11
-            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell] += R_m0_12
-            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell] += R_m1_12
-            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m0_13
-            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += R_m1_13
-            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20
-            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20
-            rho_m0_global[tx, iz_cell, ir_cell + 1 + sru] += R_m0_21
-            rho_m1_global[tx, iz_cell, ir_cell + 1 + sru] += R_m1_21
-            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22
-            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22
-            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23
-            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23
-            rho_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30
-            rho_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30
-            rho_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m0_31
-            rho_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += R_m1_31
-            rho_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32
-            rho_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32
-            rho_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33
-            rho_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33
+            rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m0_00
+            rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += R_m1_00
+            rho_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += R_m0_01
+            rho_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += R_m1_01
+            rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m0_02
+            rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += R_m1_02
+            rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m0_03
+            rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += R_m1_03
+            rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell] += R_m0_10
+            rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell] += R_m1_10
+            rho_m0_global[i_thread, iz_cell, ir_cell] += R_m0_11
+            rho_m1_global[i_thread, iz_cell, ir_cell] += R_m1_11
+            rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += R_m0_12
+            rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += R_m1_12
+            rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += R_m0_13
+            rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += R_m1_13
+            rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m0_20
+            rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += R_m1_20
+            rho_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += R_m0_21
+            rho_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += R_m1_21
+            rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m0_22
+            rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += R_m1_22
+            rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m0_23
+            rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += R_m1_23
+            rho_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m0_30
+            rho_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += R_m1_30
+            rho_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += R_m0_31
+            rho_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += R_m1_31
+            rho_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m0_32
+            rho_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += R_m1_32
+            rho_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m0_33
+            rho_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += R_m1_33
 
     return
 
@@ -821,7 +817,7 @@ def deposit_J_prange_cubic(x, y, z, w,
                         j_r_m0_global, j_r_m1_global,
                         j_t_m0_global, j_t_m1_global,
                         j_z_m0_global, j_z_m1_global,
-                        nthreads, tx_chunks):
+                        nthreads, ptcl_chunk_indices):
     """
     Deposition of the current density J using numba prange on the CPU.
     Iterates over the threads in parallel, while each thread iterates
@@ -833,8 +829,6 @@ def deposit_J_prange_cubic(x, y, z, w,
     Calculates the weighted amount of J that is deposited to the
     16 cells surounding the particle based on its shape (cubic).
 
-    The rest of the execution is similar to the CUDA equivalent function.
-
     Parameters
     ----------
     x, y, z : 1darray of floats (in meters)
@@ -867,28 +861,29 @@ def deposit_J_prange_cubic(x, y, z, w,
     nthreads : int
         Number of CPU threads used with numba prange
 
-    tx_chunks : list of int
-        Exact particle batch size per thread.
-        Last batch size can be greater if Ntot is not a multiple of nthreads.
+    ptcl_chunk_indices : array of int, of size nthreads+1
+        Boundaries of the per-thread particle chunks: thread i loops over
+        particles ptcl_chunk_indices[i] to ptcl_chunk_indices[i+1] - 1.
     """
     # Deposit the field per cell in parallel (for threads < number of cells)
-    for tx in prange( nthreads ):
+    for i_thread in prange( nthreads ):
         # Loop over all particles in thread chunk
-        for ptcl_idx in range( tx_chunks[tx], tx_chunks[tx+1] ):
+        for i_ptcl in range( ptcl_chunk_indices[i_thread],
+                             ptcl_chunk_indices[i_thread+1] ):
             # Preliminary arrays for the cylindrical conversion
             # --------------------------------------------
             # Position
-            xj = x[ptcl_idx]
-            yj = y[ptcl_idx]
-            zj = z[ptcl_idx]
+            xj = x[i_ptcl]
+            yj = y[i_ptcl]
+            zj = z[i_ptcl]
             # Velocity
-            uxj = ux[ptcl_idx]
-            uyj = uy[ptcl_idx]
-            uzj = uz[ptcl_idx]
+            uxj = ux[i_ptcl]
+            uyj = uy[i_ptcl]
+            uzj = uz[i_ptcl]
             # Inverse gamma
-            inv_gammaj = inv_gamma[ptcl_idx]
+            inv_gammaj = inv_gamma[i_ptcl]
             # Weights
-            wj = w[ptcl_idx]
+            wj = w[i_ptcl]
 
             # Cylindrical conversion
             rj = math.sqrt(xj**2 + yj**2)
@@ -1402,104 +1397,104 @@ def deposit_J_prange_cubic(x, y, z, w,
             if (ir_cell-1) < 0:
                 srl = 1
 
-            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00
-            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00
-            j_r_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m0_01
-            j_r_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_r_m1_01
-            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02
-            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02
-            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03
-            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03
-            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10
-            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10
-            j_r_m0_global[tx, iz_cell, ir_cell] += J_r_m0_11
-            j_r_m1_global[tx, iz_cell, ir_cell] += J_r_m1_11
-            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m0_12
-            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_r_m1_12
-            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13
-            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13
-            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20
-            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20
-            j_r_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m0_21
-            j_r_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_r_m1_21
-            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22
-            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22
-            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23
-            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23
-            j_r_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30
-            j_r_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30
-            j_r_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31
-            j_r_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31
-            j_r_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32
-            j_r_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32
-            j_r_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33
-            j_r_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33
-
-            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00
-            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00
-            j_t_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m0_01
-            j_t_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_t_m1_01
-            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02
-            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02
-            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03
-            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03
-            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10
-            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10
-            j_t_m0_global[tx, iz_cell, ir_cell] += J_t_m0_11
-            j_t_m1_global[tx, iz_cell, ir_cell] += J_t_m1_11
-            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m0_12
-            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_t_m1_12
-            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13
-            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13
-            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20
-            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20
-            j_t_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m0_21
-            j_t_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_t_m1_21
-            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22
-            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22
-            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23
-            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23
-            j_t_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30
-            j_t_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30
-            j_t_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31
-            j_t_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31
-            j_t_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32
-            j_t_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32
-            j_t_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33
-            j_t_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33
-
-            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00
-            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00
-            j_z_m0_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m0_01
-            j_z_m1_global[tx, iz_cell, ir_cell - 1 + srl] += J_z_m1_01
-            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02
-            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02
-            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03
-            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03
-            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10
-            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10
-            j_z_m0_global[tx, iz_cell, ir_cell] += J_z_m0_11
-            j_z_m1_global[tx, iz_cell, ir_cell] += J_z_m1_11
-            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m0_12
-            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell] += J_z_m1_12
-            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13
-            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13
-            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20
-            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20
-            j_z_m0_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m0_21
-            j_z_m1_global[tx, iz_cell, ir_cell + 1 + sru] += J_z_m1_21
-            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22
-            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22
-            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23
-            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23
-            j_z_m0_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30
-            j_z_m1_global[tx, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30
-            j_z_m0_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31
-            j_z_m1_global[tx, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31
-            j_z_m0_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32
-            j_z_m1_global[tx, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32
-            j_z_m0_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33
-            j_z_m1_global[tx, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33
+            j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m0_00
+            j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_r_m1_00
+            j_r_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_r_m0_01
+            j_r_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_r_m1_01
+            j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m0_02
+            j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_r_m1_02
+            j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m0_03
+            j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_r_m1_03
+            j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_r_m0_10
+            j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_r_m1_10
+            j_r_m0_global[i_thread, iz_cell, ir_cell] += J_r_m0_11
+            j_r_m1_global[i_thread, iz_cell, ir_cell] += J_r_m1_11
+            j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_r_m0_12
+            j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_r_m1_12
+            j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_r_m0_13
+            j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_r_m1_13
+            j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m0_20
+            j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_r_m1_20
+            j_r_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_r_m0_21
+            j_r_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_r_m1_21
+            j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m0_22
+            j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_r_m1_22
+            j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m0_23
+            j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_r_m1_23
+            j_r_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m0_30
+            j_r_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_r_m1_30
+            j_r_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_r_m0_31
+            j_r_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_r_m1_31
+            j_r_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m0_32
+            j_r_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_r_m1_32
+            j_r_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m0_33
+            j_r_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_r_m1_33
+
+            j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m0_00
+            j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_t_m1_00
+            j_t_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_t_m0_01
+            j_t_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_t_m1_01
+            j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m0_02
+            j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_t_m1_02
+            j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m0_03
+            j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_t_m1_03
+            j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_t_m0_10
+            j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_t_m1_10
+            j_t_m0_global[i_thread, iz_cell, ir_cell] += J_t_m0_11
+            j_t_m1_global[i_thread, iz_cell, ir_cell] += J_t_m1_11
+            j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_t_m0_12
+            j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_t_m1_12
+            j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_t_m0_13
+            j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_t_m1_13
+            j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m0_20
+            j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_t_m1_20
+            j_t_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_t_m0_21
+            j_t_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_t_m1_21
+            j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m0_22
+            j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_t_m1_22
+            j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m0_23
+            j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_t_m1_23
+            j_t_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m0_30
+            j_t_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_t_m1_30
+            j_t_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_t_m0_31
+            j_t_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_t_m1_31
+            j_t_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m0_32
+            j_t_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_t_m1_32
+            j_t_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m0_33
+            j_t_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_t_m1_33
+
+            j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m0_00
+            j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell - 1 + srl] += J_z_m1_00
+            j_z_m0_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_z_m0_01
+            j_z_m1_global[i_thread, iz_cell, ir_cell - 1 + srl] += J_z_m1_01
+            j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m0_02
+            j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell - 1 + srl] += J_z_m1_02
+            j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m0_03
+            j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell - 1 + srl] += J_z_m1_03
+            j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_z_m0_10
+            j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell ] += J_z_m1_10
+            j_z_m0_global[i_thread, iz_cell, ir_cell] += J_z_m0_11
+            j_z_m1_global[i_thread, iz_cell, ir_cell] += J_z_m1_11
+            j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_z_m0_12
+            j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell] += J_z_m1_12
+            j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_z_m0_13
+            j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell] += J_z_m1_13
+            j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m0_20
+            j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 1 + sru] += J_z_m1_20
+            j_z_m0_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_z_m0_21
+            j_z_m1_global[i_thread, iz_cell, ir_cell + 1 + sru] += J_z_m1_21
+            j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m0_22
+            j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 1 + sru] += J_z_m1_22
+            j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m0_23
+            j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 1 + sru] += J_z_m1_23
+            j_z_m0_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m0_30
+            j_z_m1_global[i_thread, iz_cell - 1 + szl, ir_cell + 2 + sru2] += J_z_m1_30
+            j_z_m0_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_z_m0_31
+            j_z_m1_global[i_thread, iz_cell, ir_cell + 2 + sru2] += J_z_m1_31
+            j_z_m0_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m0_32
+            j_z_m1_global[i_thread, iz_cell + 1 + szu, ir_cell + 2 + sru2] += J_z_m1_32
+            j_z_m0_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m0_33
+            j_z_m1_global[i_thread, iz_cell + 2 + szu2, ir_cell + 2 + sru2] += J_z_m1_33
 
     return
 

From 62045854951d505d72d18c361aba02c9fe9170ce Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Fri, 21 Jul 2017 21:54:17 -0700
Subject: [PATCH 36/36] Removed all mentions of linear_non_atomic

---
 fbpic/main.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fbpic/main.py b/fbpic/main.py
index 647d9c287..6c3e6c56f 100644
--- a/fbpic/main.py
+++ b/fbpic/main.py
@@ -182,10 +182,8 @@ def dens_func( z, r ) ...
 
         particle_shape: str, optional
             Set the particle shape for the charge/current deposition.
-            Possible values are 'cubic', 'linear' and 'linear_non_atomic'.
-            While 'cubic' corresponds to third order shapes and 'linear'
-            to first order shapes, 'linear_non_atomic' uses an equivalent
-            deposition scheme to 'linear' which avoids atomics on the GPU.
+            Possible values are 'cubic' and 'linear' ('cubic' corresponds to
+            third-order shapes and 'linear' to first-order shapes).
         """
         # Check whether to use CUDA
         self.use_cuda = use_cuda