Merge pull request #80 from fbpic/cpuprange

Implementation of CPU multi-threading
fbpic · Jul 22, 2017 · 68d4554 · 68d4554
2 parents 5897555 + 6204585
commit 68d4554
Show file tree

Hide file tree

Showing 27 changed files with 4,880 additions and 3,236 deletions.
diff --git a/fbpic/boundaries/moving_window.py b/fbpic/boundaries/moving_window.py
@@ -9,6 +9,7 @@
 from scipy.constants import c
 from fbpic.particles import Particles
 from fbpic.lpa_utils.boosted_frame import BoostConverter
+from fbpic.threading_utils import njit_parallel, prange
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed
 if cuda_installed:
@@ -318,86 +319,36 @@ def shift_spect_grid( self, grid, n_move,
         """
         if grid.use_cuda:
             shift = grid.d_field_shift
+            # Get a 2D CUDA grid of the size of the grid
+            tpb, bpg = cuda_tpb_bpg_2d( grid.Ep.shape[0], grid.Ep.shape[1] )
             # Shift all the fields on the GPU
-            self.shift_spect_field_gpu( grid.Ep, shift, n_move )
-            self.shift_spect_field_gpu( grid.Em, shift, n_move )
-            self.shift_spect_field_gpu( grid.Ez, shift, n_move )
-            self.shift_spect_field_gpu( grid.Bp, shift, n_move )
-            self.shift_spect_field_gpu( grid.Bm, shift, n_move )
-            self.shift_spect_field_gpu( grid.Bz, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Ep, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Em, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Ez, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Bp, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Bm, shift, n_move )
+            shift_spect_array_gpu[tpb, bpg]( grid.Bz, shift, n_move )
             if shift_rho:
-                self.shift_spect_field_gpu( grid.rho_prev, shift, n_move )
+                shift_spect_array_gpu[tpb, bpg]( grid.rho_prev, shift, n_move )
             if shift_currents:
-                self.shift_spect_field_gpu( grid.Jp, shift, n_move )
-                self.shift_spect_field_gpu( grid.Jm, shift, n_move )
-                self.shift_spect_field_gpu( grid.Jz, shift, n_move )
+                shift_spect_array_gpu[tpb, bpg]( grid.Jp, shift, n_move )
+                shift_spect_array_gpu[tpb, bpg]( grid.Jm, shift, n_move )
+                shift_spect_array_gpu[tpb, bpg]( grid.Jz, shift, n_move )
         else:
             shift = grid.field_shift
             # Shift all the fields on the CPU
-            self.shift_spect_field( grid.Ep, shift, n_move )
-            self.shift_spect_field( grid.Em, shift, n_move )
-            self.shift_spect_field( grid.Ez, shift, n_move )
-            self.shift_spect_field( grid.Bp, shift, n_move )
-            self.shift_spect_field( grid.Bm, shift, n_move )
-            self.shift_spect_field( grid.Bz, shift, n_move )
+            shift_spect_array_cpu( grid.Ep, shift, n_move )
+            shift_spect_array_cpu( grid.Em, shift, n_move )
+            shift_spect_array_cpu( grid.Ez, shift, n_move )
+            shift_spect_array_cpu( grid.Bp, shift, n_move )
+            shift_spect_array_cpu( grid.Bm, shift, n_move )
+            shift_spect_array_cpu( grid.Bz, shift, n_move )
             if shift_rho:
-                self.shift_spect_field( grid.rho_prev, shift, n_move )
+                shift_spect_array_cpu( grid.rho_prev, shift, n_move )
             if shift_currents:
-                self.shift_spect_field( grid.Jp, shift, n_move )
-                self.shift_spect_field( grid.Jm, shift, n_move )
-                self.shift_spect_field( grid.Jz, shift, n_move )
-
-    def shift_spect_field( self, field_array, shift_factor, n_move ):
-        """
-        Shift the field 'field_array' by n_move cells.
-        This is done in spectral space and corresponds to multiplying the
-        fields with the factor exp(i*kz_true*dz)**n_move .
-        (Typically n_move is positive, and the fields are shifted backwards)
-
-        Parameters
-        ----------
-        field_array: 2darray of complexs
-            Contains the value of the fields, and is modified by
-            this function
-
-        shift_factor: 1darray of complexs
-            Contains the shift array, that is multiplied to the fields in
-            spectral space to shift them by one cell in spatial space
-            ( exp(i*kz_true*dz) )
-
-        n_move: int
-            The number of cells by which the grid should be shifted
-        """
-        # Multiply with (shift_factor*sign(n_move))**n_move
-        field_array *= ( shift_factor[:, np.newaxis] )**n_move
-
-    def shift_spect_field_gpu( self, field_array, shift_factor, n_move):
-        """
-        Shift the field 'field_array' by n_move cells on the GPU.
-        This is done in spectral space and corresponds to multiplying the
-        fields with the factor exp(i*kz_true*dz)**n_move .
-        (Typically n_move is positive, and the fields are shifted backwards)
-
-        Parameters
-        ----------
-        field_array: 2darray of complexs
-            Contains the value of the fields, and is modified by
-            this function
-
-        shift_factor: 1darray of complexs
-            Contains the shift array, that is multiplied to the fields in
-            spectral space to shift them by one cell in spatial space
-            ( exp(i*kz_true*dz) )
-
-        n_move: int
-            The number of cells by which the grid should be shifted
-        """
-        # Get a 2D CUDA grid of the size of the grid
-        dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d(
-            field_array.shape[0], field_array.shape[1] )
-        # Shift the field array in place
-        shift_spect_array_gpu[dim_grid_2d, dim_block_2d](
-            field_array, shift_factor, n_move)
+                shift_spect_array_cpu( grid.Jp, shift, n_move )
+                shift_spect_array_cpu( grid.Jm, shift, n_move )
+                shift_spect_array_cpu( grid.Jz, shift, n_move )
 
     def shift_interp_grid( self, grid, n_move,
                            shift_rho=True, shift_currents=False ):
@@ -513,6 +464,39 @@ def shift_interp_field_gpu( self, field_array, n_move):
         # Return the new shifted field array
         return( field_array )
 
+@njit_parallel
+def shift_spect_array_cpu( field_array, shift_factor, n_move ):
+    """
+    Shift the field 'field_array' by n_move cells on CPU.
+    This is done in spectral space and corresponds to multiplying the
+    fields with the factor exp(i*kz_true*dz)**n_move .
+
+    Parameters
+    ----------
+    field_array: 2darray of complexs
+        Contains the value of the fields, and is modified by
+        this function
+
+    shift_factor: 1darray of complexs
+        Contains the shift array, that is multiplied to the fields in
+        spectral space to shift them by one cell in spatial space
+        ( exp(i*kz_true*dz) )
+
+    n_move: int
+        The number of cells by which the grid should be shifted
+    """
+    Nz, Nr = field_array.shape
+
+    # Loop over the 2D array (in parallel over z if threading is enabled)
+    for iz in prange( Nz ):
+        power_shift = shift_factor[iz]
+        # Calculate the shift factor (raising to the power n_move)
+        for i in range(1,n_move):
+            power_shift *= shift_factor[iz]
+        # Shift fields backwards
+        for ir in range( Nr ):
+            field_array[iz, ir] *= power_shift
+
 if cuda_installed:
 
     @cuda.jit('void(complex128[:,:], complex128[:,:], int32)')

diff --git a/fbpic/fields/numba_methods.py b/fbpic/fields/numba_methods.py
@@ -5,21 +5,18 @@
 This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
 It defines the optimized fields methods that use numba on a CPU
 """
-import numba
 from scipy.constants import c, epsilon_0, mu_0
 c2 = c**2
+from fbpic.threading_utils import njit_parallel, prange
 
-@numba.jit('void(complex128[:,:], complex128[:,:], \
-           complex128[:,:], complex128[:,:], complex128[:,:], \
-           float64[:,:], float64[:,:], float64[:,:], \
-           float64, int32, int32)')
+@njit_parallel
 def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
                             kz, kr, inv_k2, inv_dt, Nz, Nr ):
     """
     Correct the currents in spectral space, using the standard pstad
     """
-    # Loop over the 2D grid
-    for iz in range(Nz):
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
         for ir in range(Nr):
 
             # Calculate the intermediate variable F
@@ -33,13 +30,9 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
             Jm[iz, ir] += -0.5 * kr[iz, ir] * F
             Jz[iz, ir] += -1.j * kz[iz, ir] * F
 
-@numba.jit('void(complex128[:,:], complex128[:,:], complex128[:,:], \
-           complex128[:,:], complex128[:,:], complex128[:,:], \
-           complex128[:,:], complex128[:,:], complex128[:,:], \
-           complex128[:,:], complex128[:,:], \
-           float64[:,:], float64[:,:], float64[:,:], \
-           float64[:,:], float64[:,:], float64[:,:], float64[:,:], float64, \
-           int8, int32, int32)')
+    return
+
+@njit_parallel
 def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                        rho_prev, rho_next,
                        rho_prev_coef, rho_next_coef, j_coef,
@@ -50,8 +43,8 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
 
     See the documentation of SpectralGrid.push_eb_with
     """
-    # Loop over the 2D grid
-    for iz in range(Nz):
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
         for ir in range(Nr):
 
             # Save the electric fields, since it is needed for the B push
@@ -106,7 +99,9 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                 + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
                             + 1.j*kr[iz, ir]*Jm[iz, ir] )
 
-@numba.jit
+    return
+
+@njit_parallel
 def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
                             kz, kr, inv_k2,
                             j_corr_coef, T_eb, T_cc,
@@ -115,8 +110,8 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
     Correct the currents in spectral space, using the assumption
     of comoving currents
     """
-    # Loop over the 2D grid
-    for iz in range(Nz):
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
         for ir in range(Nr):
 
             # Calculate the intermediate variable F
@@ -130,7 +125,9 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
             Jm[iz, ir] += -0.5 * kr[iz, ir] * F
             Jz[iz, ir] += -1.j * kz[iz, ir] * F
 
-@numba.jit
+    return
+
+@njit_parallel
 def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                        rho_prev, rho_next,
                        rho_prev_coef, rho_next_coef, j_coef,
@@ -207,3 +204,5 @@ def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
                             + 1.j*kr[iz, ir]*Em_old ) \
                 + j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
                             + 1.j*kr[iz, ir]*Jm[iz, ir] )
+
+    return
diff --git a/fbpic/fields/spectral_transform/spectral_transformer.py b/fbpic/fields/spectral_transform/spectral_transformer.py
@@ -9,6 +9,7 @@
 from .hankel import DHT
 from .fourier import FFT
 
+from .threading_methods import numba_rt_to_pm, numba_pm_to_rt
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed
 if cuda_installed:
@@ -139,12 +140,13 @@ def spect2interp_vect( self, spect_array_p, spect_array_m,
                 self.spect_buffer_r, self.spect_buffer_t )
         else :
             # Combine them on the CPU
-            # (It is important to write the affectation in the following way,
-            # since self.spect_buffer_p and self.spect_buffer_r actually point
-            # to the same object, for memory economy)
-            self.spect_buffer_r[:,:], self.spect_buffer_t[:,:] = \
-                    ( self.spect_buffer_p + self.spect_buffer_m), \
-                1.j*( self.spect_buffer_p - self.spect_buffer_m)
+            # (self.spect_buffer_r and self.spect_buffer_t are
+            # passed in the following line, in order to make things
+            # explicit, but they actually point to the same object
+            # as self.spect_buffer_p, self.spect_buffer_m,
+            # for economy of memory)
+            numba_pm_to_rt( self.spect_buffer_p, self.spect_buffer_m,
+                            self.spect_buffer_r, self.spect_buffer_t )
 
         # Finally perform the FFT (along axis 0, which corresponds to z)
         self.fft.inverse_transform( self.spect_buffer_r, interp_array_r )
@@ -205,13 +207,14 @@ def interp2spect_vect( self, interp_array_r, interp_array_t,
                 self.spect_buffer_r, self.spect_buffer_t,
                 self.spect_buffer_p, self.spect_buffer_m )
         else :
-            # Combine them on the CPU
-            # (It is important to write the affectation in the following way,
-            # since self.spect_buffer_p and self.spect_buffer_r actually point
-            # to the same object, for memory economy.)
-            self.spect_buffer_p[:,:], self.spect_buffer_m[:,:] = \
-                0.5*( self.spect_buffer_r - 1.j*self.spect_buffer_t ), \
-                0.5*( self.spect_buffer_r + 1.j*self.spect_buffer_t )
+            # Combine them on the GPU
+            # (self.spect_buffer_p and self.spect_buffer_m are
+            # passed in the following line, in order to make things
+            # explicit, but they actually point to the same object
+            # as self.spect_buffer_r, self.spect_buffer_t,
+            # for economy of memory)
+            numba_rt_to_pm( self.spect_buffer_r, self.spect_buffer_t,
+                            self.spect_buffer_p, self.spect_buffer_m )
 
         # Perform the inverse DHT (along axis -1, which corresponds to r)
         self.dhtp.transform( self.spect_buffer_p, spect_array_p )

diff --git a/fbpic/fields/spectral_transform/threading_methods.py b/fbpic/fields/spectral_transform/threading_methods.py
@@ -0,0 +1,58 @@
+# Copyright 2016, FBPIC contributors
+# Authors: Remi Lehe, Manuel Kirchen
+# License: 3-Clause-BSD-LBNL
+"""
+This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
+It defines a set of functions that are useful when converting the
+fields from interpolation grid to the spectral grid and vice-versa
+"""
+from fbpic.threading_utils import prange, njit_parallel
+
+# ----------------------------------------------------
+# Functions that combine components in spectral space
+# ----------------------------------------------------
+
+@njit_parallel
+def numba_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) :
+    """
+    Combine the arrays buffer_r and buffer_t to produce the
+    arrays buffer_p and buffer_m, according to the rules of
+    the Fourier-Hankel decomposition (see associated paper)
+    """
+    Nz, Nr = buffer_r.shape
+
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
+        for ir in range(Nr):
+
+            # Use intermediate variables, as the arrays
+            # buffer_r and buffer_t may actually point to the same
+            # object as buffer_p and buffer_m, for economy of memory
+            value_r = buffer_r[iz, ir]
+            value_t = buffer_t[iz, ir]
+            # Combine the values
+            buffer_p[iz, ir] = 0.5*( value_r - 1.j*value_t )
+            buffer_m[iz, ir] = 0.5*( value_r + 1.j*value_t )
+
+
+@njit_parallel
+def numba_pm_to_rt( buffer_p, buffer_m, buffer_r, buffer_t ) :
+    """
+    Combine the arrays buffer_p and buffer_m to produce the
+    arrays buffer_r and buffer_t, according to the rules of
+    the Fourier-Hankel decomposition (see associated paper)
+    """
+    Nz, Nr = buffer_p.shape
+
+    # Loop over the 2D grid (parallel in z, if threading is installed)
+    for iz in prange(Nz):
+        for ir in range(Nr):
+
+            # Use intermediate variables, as the arrays
+            # buffer_r and buffer_t may actually point to the same
+            # object as buffer_p and buffer_m, for economy of memory
+            value_p = buffer_p[iz, ir]
+            value_m = buffer_m[iz, ir]
+            # Combine the values
+            buffer_r[iz, ir] =     ( value_p + value_m )
+            buffer_t[iz, ir] = 1.j*( value_p - value_m )
diff --git a/fbpic/lpa_utils/laser/antenna.py b/fbpic/lpa_utils/laser/antenna.py
@@ -10,8 +10,8 @@
 from scipy.constants import e, c, epsilon_0, physical_constants
 r_e = physical_constants['classical electron radius'][0]
 from .profiles import gaussian_profile
-from fbpic.particles.utility_methods import weights
-from fbpic.particles.numba_methods import deposit_field_numba
+from fbpic.particles.utilities.utility_methods import weights
+from fbpic.particles.deposition.numba_methods import deposit_field_numba
 
 # Check if CUDA is available, then import CUDA functions
 from fbpic.cuda_utils import cuda_installed