Threaded grids, and avoiding duplication #89

Merged (13 commits) on Jul 20, 2017
39 changes: 19 additions & 20 deletions fbpic/fields/numba_methods.py
@@ -5,21 +5,18 @@
 This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
 It defines the optimized fields methods that use numba on a CPU
 """
-import numba
 from scipy.constants import c, epsilon_0, mu_0
 c2 = c**2
+from fbpic.threading_utils import njit_parallel, prange

-@numba.jit('void(complex128[:,:], complex128[:,:], \
-    complex128[:,:], complex128[:,:], complex128[:,:], \
-    float64[:,:], float64[:,:], float64[:,:], \
-    float64, int32, int32)')
+@njit_parallel
 def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
                                      kz, kr, inv_k2, inv_dt, Nz, Nr ):
     """
     Correct the currents in spectral space, using the standard psatd
     """
-    # Loop over the 2D grid
-    for iz in range(Nz):
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
         for ir in range(Nr):

             # Calculate the intermediate variable F
@@ -33,13 +30,9 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
             Jm[iz, ir] += -0.5 * kr[iz, ir] * F
             Jz[iz, ir] += -1.j * kz[iz, ir] * F
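Note: the lines that compute the intermediate variable `F` are collapsed in this diff, so the first relation below is inferred from the visible updates and from the standard spectral current correction; only the `Jp`/`Jm`/`Jz` updates are literally in the source. With $k^2 = k_z^2 + k_r^2$ (whose inverse is presumably `inv_k2`):

$$
\hat F = -\frac{1}{k^2}\left[\frac{\hat\rho^{\,n+1}-\hat\rho^{\,n}}{\Delta t} + i\,k_z \hat J_z + k_r\big(\hat J_+ - \hat J_-\big)\right],
\qquad
\hat J_\pm \mathrel{+}= \pm\frac{k_r}{2}\,\hat F,
\qquad
\hat J_z \mathrel{+}= -\,i\,k_z\,\hat F
$$

Substituting the updates back in shows that the corrected currents satisfy $(\hat\rho^{\,n+1}-\hat\rho^{\,n})/\Delta t + i k_z \hat J_z + k_r(\hat J_+ - \hat J_-) = 0$, i.e. the discretized continuity equation in spectral space.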

-@numba.jit('void(complex128[:,:], complex128[:,:], complex128[:,:], \
-    complex128[:,:], complex128[:,:], complex128[:,:], \
-    complex128[:,:], complex128[:,:], complex128[:,:], \
-    complex128[:,:], complex128[:,:], \
-    float64[:,:], float64[:,:], float64[:,:], \
-    float64[:,:], float64[:,:], float64[:,:], float64[:,:], float64, \
-    int8, int32, int32)')
+    return
+
+@njit_parallel
 def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                             rho_prev, rho_next,
                             rho_prev_coef, rho_next_coef, j_coef,
@@ -50,8 +43,8 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,

     See the documentation of SpectralGrid.push_eb_with
     """
-    # Loop over the 2D grid
-    for iz in range(Nz):
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
         for ir in range(Nr):

             # Save the electric fields, since they are needed for the B push
@@ -106,7 +99,9 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                 + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
                 + 1.j*kr[iz, ir]*Jm[iz, ir] )

-@numba.jit
+    return
+
+@njit_parallel
 def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
                                      kz, kr, inv_k2,
                                      j_corr_coef, T_eb, T_cc,
@@ -115,8 +110,8 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
     Correct the currents in spectral space, using the assumption
     of comoving currents
     """
-    # Loop over the 2D grid
-    for iz in range(Nz):
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
         for ir in range(Nr):

             # Calculate the intermediate variable F
@@ -130,7 +125,9 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
             Jm[iz, ir] += -0.5 * kr[iz, ir] * F
             Jz[iz, ir] += -1.j * kz[iz, ir] * F

-@numba.jit
+    return
+
+@njit_parallel
 def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                             rho_prev, rho_next,
                             rho_prev_coef, rho_next_coef, j_coef,
@@ -207,3 +204,5 @@ def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                 + 1.j*kr[iz, ir]*Em_old ) \
                 + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
                 + 1.j*kr[iz, ir]*Jm[iz, ir] )
+
+    return
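The new `fbpic/threading_utils` module itself is not part of this diff. Judging from the names imported above (`njit_parallel`, `prange`, and `threading_enabled` in `fbpic/main.py` below), it presumably wraps numba's parallel compilation with a serial fallback. A minimal sketch of such a module, assuming only standard numba APIs (this is not the actual FBPIC source):

```python
# Hypothetical sketch of fbpic/threading_utils.py: provide parallel
# compilation when the installed numba supports it, and fall back to
# serial compilation otherwise.
import numba

try:
    # numba.prange only exists in versions of numba that support
    # parallel compilation; older versions raise AttributeError here
    prange = numba.prange
    # Decorator that compiles a function with multi-threading enabled
    njit_parallel = numba.njit( parallel=True )
    threading_enabled = True
except AttributeError:
    # Serial fallback: prange degrades to the built-in range, so the
    # very same kernels compile and run on a single thread
    prange = range
    njit_parallel = numba.njit
    threading_enabled = False
```

With this pattern, the kernels above are written once and run either threaded or serial, which is the "avoiding duplication" part of the PR title.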
29 changes: 16 additions & 13 deletions fbpic/fields/spectral_transform/spectral_transformer.py
@@ -9,6 +9,7 @@
 from .hankel import DHT
 from .fourier import FFT

+from .threading_methods import numba_rt_to_pm, numba_pm_to_rt
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed
 if cuda_installed:
@@ -139,12 +140,13 @@ def spect2interp_vect( self, spect_array_p, spect_array_m,
                 self.spect_buffer_r, self.spect_buffer_t )
         else :
             # Combine them on the CPU
-            # (It is important to write the affectation in the following way,
-            # since self.spect_buffer_p and self.spect_buffer_r actually point
-            # to the same object, for memory economy)
-            self.spect_buffer_r[:,:], self.spect_buffer_t[:,:] = \
-                ( self.spect_buffer_p + self.spect_buffer_m), \
-                1.j*( self.spect_buffer_p - self.spect_buffer_m)
+            # (self.spect_buffer_r and self.spect_buffer_t are
+            # passed in the following line, in order to make things
+            # explicit, but they actually point to the same object
+            # as self.spect_buffer_p, self.spect_buffer_m,
+            # for economy of memory)
+            numba_pm_to_rt( self.spect_buffer_p, self.spect_buffer_m,
+                            self.spect_buffer_r, self.spect_buffer_t )

         # Finally perform the FFT (along axis 0, which corresponds to z)
         self.fft.inverse_transform( self.spect_buffer_r, interp_array_r )
@@ -205,13 +207,14 @@ def interp2spect_vect( self, interp_array_r, interp_array_t,
                 self.spect_buffer_r, self.spect_buffer_t,
                 self.spect_buffer_p, self.spect_buffer_m )
         else :
-            # Combine them on the CPU
-            # (It is important to write the affectation in the following way,
-            # since self.spect_buffer_p and self.spect_buffer_r actually point
-            # to the same object, for memory economy.)
-            self.spect_buffer_p[:,:], self.spect_buffer_m[:,:] = \
-                0.5*( self.spect_buffer_r - 1.j*self.spect_buffer_t ), \
-                0.5*( self.spect_buffer_r + 1.j*self.spect_buffer_t )
+            # Combine them on the CPU
+            # (self.spect_buffer_p and self.spect_buffer_m are
+            # passed in the following line, in order to make things
+            # explicit, but they actually point to the same object
+            # as self.spect_buffer_r, self.spect_buffer_t,
+            # for economy of memory)
+            numba_rt_to_pm( self.spect_buffer_r, self.spect_buffer_t,
+                            self.spect_buffer_p, self.spect_buffer_m )

         # Perform the inverse DHT (along axis -1, which corresponds to r)
         self.dhtp.transform( self.spect_buffer_p, spect_array_p )
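A note on the buffer aliasing that these comments describe: the old NumPy expressions were safe despite the aliasing, because NumPy fully evaluates the right-hand side before assigning. An explicit element-wise loop has no such protection, so each value must be copied into a local variable before either output array is written, which is what the new kernels do. A small self-contained illustration of that pattern (hypothetical buffer names, not FBPIC code):

```python
import numpy as np

# Two pairs of logical buffers that deliberately share memory,
# like spect_buffer_p/spect_buffer_r in the transformer
buffer_p = buffer_r = np.array( [[1.+2.j, 3.-1.j]] )
buffer_m = buffer_t = np.array( [[0.5+0.j, 2.+2.j]] )

for iz in range( buffer_p.shape[0] ):
    for ir in range( buffer_p.shape[1] ):
        # Read both aliased values *before* overwriting either array
        value_p = buffer_p[iz, ir]
        value_m = buffer_m[iz, ir]
        buffer_r[iz, ir] = value_p + value_m          # clobbers buffer_p
        buffer_t[iz, ir] = 1.j*( value_p - value_m )  # clobbers buffer_m
```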
58 changes: 58 additions & 0 deletions fbpic/fields/spectral_transform/threading_methods.py
@@ -0,0 +1,58 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines a set of functions that are useful when converting the
+fields from the interpolation grid to the spectral grid and vice-versa
+"""
+from fbpic.threading_utils import prange, njit_parallel
+
+# ----------------------------------------------------
+# Functions that combine components in spectral space
+# ----------------------------------------------------
+
+@njit_parallel
+def numba_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) :
+    """
+    Combine the arrays buffer_r and buffer_t to produce the
+    arrays buffer_p and buffer_m, according to the rules of
+    the Fourier-Hankel decomposition (see associated paper)
+    """
+    Nz, Nr = buffer_r.shape
+
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
+        for ir in range(Nr):
+
+            # Use intermediate variables, as the arrays
+            # buffer_r and buffer_t may actually point to the same
+            # object as buffer_p and buffer_m, for economy of memory
+            value_r = buffer_r[iz, ir]
+            value_t = buffer_t[iz, ir]
+            # Combine the values
+            buffer_p[iz, ir] = 0.5*( value_r - 1.j*value_t )
+            buffer_m[iz, ir] = 0.5*( value_r + 1.j*value_t )
+
+
+@njit_parallel
+def numba_pm_to_rt( buffer_p, buffer_m, buffer_r, buffer_t ) :
+    """
+    Combine the arrays buffer_p and buffer_m to produce the
+    arrays buffer_r and buffer_t, according to the rules of
+    the Fourier-Hankel decomposition (see associated paper)
+    """
+    Nz, Nr = buffer_p.shape
+
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
+        for ir in range(Nr):
+
+            # Use intermediate variables, as the arrays
+            # buffer_p and buffer_m may actually point to the same
+            # object as buffer_r and buffer_t, for economy of memory
+            value_p = buffer_p[iz, ir]
+            value_m = buffer_m[iz, ir]
+            # Combine the values
+            buffer_r[iz, ir] = ( value_p + value_m )
+            buffer_t[iz, ir] = 1.j*( value_p - value_m )
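Assuming the module path from this diff, the two new kernels can be sanity-checked against each other: since $p = (r - i\,t)/2$ and $m = (r + i\,t)/2$, while $r = p + m$ and $t = i\,(p - m)$, a round trip should be the identity:

```python
import numpy as np
from fbpic.fields.spectral_transform.threading_methods import (
    numba_rt_to_pm, numba_pm_to_rt )

# Random complex (Nz, Nr) arrays standing in for spectral buffers
Nz, Nr = 8, 4
rng = np.random.RandomState( 0 )
buffer_r = rng.randn( Nz, Nr ) + 1.j*rng.randn( Nz, Nr )
buffer_t = rng.randn( Nz, Nr ) + 1.j*rng.randn( Nz, Nr )

# Separate (non-aliased) outputs here, to keep the originals intact
buffer_p, buffer_m = np.empty_like( buffer_r ), np.empty_like( buffer_r )
numba_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m )

# Convert back and check that (r, t) is recovered
r_back, t_back = np.empty_like( buffer_r ), np.empty_like( buffer_r )
numba_pm_to_rt( buffer_p, buffer_m, r_back, t_back )
assert np.allclose( r_back, buffer_r ) and np.allclose( t_back, buffer_t )
```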
33 changes: 9 additions & 24 deletions fbpic/main.py
@@ -11,6 +11,8 @@
 # as it sets the cuda context)
 from mpi4py import MPI
 import numba
+# Check if threading is available
+from .threading_utils import threading_enabled
 # Check if CUDA is available, then import CUDA functions
 from .cuda_utils import cuda_installed
 if cuda_installed:
@@ -43,9 +45,8 @@ def __init__(self, Nz, zmax, Nr, rmax, Nm, dt, p_zmin, p_zmax,
                  p_rmin, p_rmax, p_nz, p_nr, p_nt, n_e, zmin=0.,
                  n_order=-1, dens_func=None, filter_currents=True,
                  v_comoving=None, use_galilean=False, initialize_ions=False,
-                 use_cuda=False, use_threading=True, nthreads=None,
-                 n_guard=None, n_damp=30, exchange_period=None,
-                 boundaries='periodic', gamma_boost=None,
+                 use_cuda=False, n_guard=None, n_damp=30, exchange_period=None,
+                 boundaries='periodic', gamma_boost=None,
                  use_all_mpi_ranks=True, particle_shape='linear' ):
         """
         Initializes a simulation, by creating the following structures:
@@ -132,12 +133,6 @@ def dens_func( z, r ) ...

         use_cuda: bool, optional
             Whether to use CUDA (GPU) acceleration
-        use_threading : bool, optional
-            Whether to use multi-threading on the CPU.
-        nthreads: int, optional
-            Number of CPU multi-threading threads used (if use_threading
-            is set). If nthreads is set to None, the number of threads
-            is automatically determined.

         n_guard: int, optional
             Number of guard cells to use at the left and right of
@@ -199,16 +194,8 @@ def dens_func( z, r ) ...
             print('*** Performing the simulation on CPU.')
             self.use_cuda = False
         # CPU multi-threading
-        self.use_threading = use_threading
-        if self.use_threading:
-            # Define number of threads used
-            if nthreads is not None:
-                # Set user-defined number of threads
-                self.nthreads = nthreads
-                numba.config.NUMBA_NUM_THREADS = self.nthreads
-            else:
-                # Automatically take numba preset for number of threads
-                self.nthreads = numba.config.NUMBA_NUM_THREADS
+        self.use_threading = threading_enabled

         # Register the comoving parameters
         self.v_comoving = v_comoving
         self.use_galilean = use_galilean
@@ -254,16 +241,14 @@ def dens_func( z, r ) ...
                       zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax,
                       Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m,
                       grid_shape=grid_shape, particle_shape=particle_shape,
-                      use_cuda=self.use_cuda,
-                      use_threading=self.use_threading) ]
+                      use_cuda=self.use_cuda ) ]
         if initialize_ions :
             self.ptcl.append(
                 Particles(q=e, m=m_p, n=n_e, Npz=Npz, zmin=p_zmin,
                       zmax=p_zmax, Npr=Npr, rmin=p_rmin, rmax=p_rmax,
                       Nptheta=p_nt, dt=dt, dens_func=dens_func, uz_m=uz_m,
                       grid_shape=grid_shape, particle_shape=particle_shape,
-                      use_cuda=self.use_cuda,
-                      use_threading=self.use_threading) )
+                      use_cuda=self.use_cuda ) )

         # Register the number of particles per cell along z, and dt
         # (Necessary for the moving window)
@@ -620,7 +605,7 @@ def print_simulation_setup( comm, use_cuda, use_threading ):
     if use_threading and not use_cuda:
         message += " (%d threads per proc)" %numba.config.NUMBA_NUM_THREADS
     message += ".\n"

     print( message )

 def adapt_to_grid( x, p_xmin, p_xmax, p_nx, ncells_empty=0 ):
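With `use_threading` and `nthreads` gone from `Simulation.__init__`, the thread count is now taken from numba's own configuration. Assuming numba's standard `NUMBA_NUM_THREADS` mechanism (which this PR already reads via `numba.config.NUMBA_NUM_THREADS` above, and does not add), the number of threads can still be controlled from the user script:

```python
import os
# Must be set before numba is first imported (i.e. before importing
# fbpic), since numba reads NUMBA_NUM_THREADS at import time
os.environ['NUMBA_NUM_THREADS'] = '8'

from fbpic.main import Simulation
# njit_parallel kernels now run with 8 threads (when threading is enabled)
```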