Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of CPU multi-threading #80

Merged
merged 41 commits into from
Jul 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
9ae48f3
Initial CPU multi-threading implementation
MKirchen Jul 14, 2017
497c55a
Initial CPU multi-threading implementation (part 2)
MKirchen Jul 14, 2017
8f2e3ff
Fix pyflakes errors
MKirchen Jul 14, 2017
380b326
Print number of threads along with number of MPI procs
RemiLehe Jul 15, 2017
af15196
Swapped the order of global arrays + removed thread-local arrays
RemiLehe Jul 16, 2017
1e252be
Fix pyflakes errors
RemiLehe Jul 16, 2017
f8536d7
Merge pull request #82 from RemiLehe/better_thread_scaling
MKirchen Jul 16, 2017
e539347
Merge branch 'dev' into cpuprange
RemiLehe Jul 16, 2017
f9ce679
Corrected import pattern in laser antenna
RemiLehe Jul 16, 2017
eedf019
Fix thread index calculation
RemiLehe Jul 16, 2017
3a5f0f0
Fix automated tests
RemiLehe Jul 17, 2017
a3c2248
Implement parallel reduce
RemiLehe Jul 17, 2017
da28bbe
Added docstring to the function
RemiLehe Jul 17, 2017
1d9c4a7
Merge pull request #83 from RemiLehe/parallel_reduction
MKirchen Jul 17, 2017
f70cca1
Added cubic deposition functions
MKirchen Jul 17, 2017
2053e1f
Adapted particles.py for cubic prange deposition
MKirchen Jul 17, 2017
392354d
Removed linear_non_atomic shape from uniform_rho test
MKirchen Jul 17, 2017
7055845
Corrected some bugs introduced in last commits
MKirchen Jul 17, 2017
0a22108
Fix cubic deposition and cubic gathering
MKirchen Jul 17, 2017
c2f75b3
Remove function signature in field methods
RemiLehe Jul 19, 2017
18486c1
Create threading_utils.py
RemiLehe Jul 19, 2017
fce6692
Check if threading is installed in main.py
RemiLehe Jul 19, 2017
b6c3584
Added threaded methods for the fields
RemiLehe Jul 19, 2017
0c52a62
Added parallel capability for grid methods
RemiLehe Jul 19, 2017
8458a0d
Removed threaded push methods
RemiLehe Jul 19, 2017
6317aa3
Corrected push_x's return
RemiLehe Jul 19, 2017
c817b44
Correct push_p and push_x with return function
RemiLehe Jul 19, 2017
f1f2ab2
Give the right threading flag to particles
RemiLehe Jul 19, 2017
54bd6f0
Threaded the routines that convert from p/m to r/t components
RemiLehe Jul 19, 2017
5d2c7e5
Remove the flag `use_threading` as an input argument
RemiLehe Jul 20, 2017
62e37c5
Correct pyflakes errors
RemiLehe Jul 20, 2017
076b668
Thread the shifting of the grid in spectral space
RemiLehe Jul 20, 2017
0e654a1
Fix the threaded shift function
RemiLehe Jul 20, 2017
11ce49d
Remove arguments nthreads
RemiLehe Jul 20, 2017
ffe436c
Merge pull request #89 from RemiLehe/threaded_grids
MKirchen Jul 20, 2017
93149c0
Merge pull request #91 from RemiLehe/thread_shift_window
MKirchen Jul 20, 2017
77be6cf
Modified import structure of the prange function
RemiLehe Jul 20, 2017
7772819
Replace line endings to unix style
RemiLehe Jul 22, 2017
afd5d3b
Replace tx_chunks by an array
RemiLehe Jul 22, 2017
d35a7c1
Changes in variable names and docstring
RemiLehe Jul 22, 2017
6204585
Removed all mentions of linear_non_atomic
RemiLehe Jul 22, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 56 additions & 72 deletions fbpic/boundaries/moving_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from scipy.constants import c
from fbpic.particles import Particles
from fbpic.lpa_utils.boosted_frame import BoostConverter
from fbpic.threading_utils import njit_parallel, prange
# Check if CUDA is available, then import CUDA functions
from fbpic.cuda_utils import cuda_installed
if cuda_installed:
Expand Down Expand Up @@ -318,86 +319,36 @@ def shift_spect_grid( self, grid, n_move,
"""
if grid.use_cuda:
shift = grid.d_field_shift
# Get a 2D CUDA grid of the size of the grid
tpb, bpg = cuda_tpb_bpg_2d( grid.Ep.shape[0], grid.Ep.shape[1] )
# Shift all the fields on the GPU
self.shift_spect_field_gpu( grid.Ep, shift, n_move )
self.shift_spect_field_gpu( grid.Em, shift, n_move )
self.shift_spect_field_gpu( grid.Ez, shift, n_move )
self.shift_spect_field_gpu( grid.Bp, shift, n_move )
self.shift_spect_field_gpu( grid.Bm, shift, n_move )
self.shift_spect_field_gpu( grid.Bz, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Ep, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Em, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Ez, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Bp, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Bm, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Bz, shift, n_move )
if shift_rho:
self.shift_spect_field_gpu( grid.rho_prev, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.rho_prev, shift, n_move )
if shift_currents:
self.shift_spect_field_gpu( grid.Jp, shift, n_move )
self.shift_spect_field_gpu( grid.Jm, shift, n_move )
self.shift_spect_field_gpu( grid.Jz, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Jp, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Jm, shift, n_move )
shift_spect_array_gpu[tpb, bpg]( grid.Jz, shift, n_move )
else:
shift = grid.field_shift
# Shift all the fields on the CPU
self.shift_spect_field( grid.Ep, shift, n_move )
self.shift_spect_field( grid.Em, shift, n_move )
self.shift_spect_field( grid.Ez, shift, n_move )
self.shift_spect_field( grid.Bp, shift, n_move )
self.shift_spect_field( grid.Bm, shift, n_move )
self.shift_spect_field( grid.Bz, shift, n_move )
shift_spect_array_cpu( grid.Ep, shift, n_move )
shift_spect_array_cpu( grid.Em, shift, n_move )
shift_spect_array_cpu( grid.Ez, shift, n_move )
shift_spect_array_cpu( grid.Bp, shift, n_move )
shift_spect_array_cpu( grid.Bm, shift, n_move )
shift_spect_array_cpu( grid.Bz, shift, n_move )
if shift_rho:
self.shift_spect_field( grid.rho_prev, shift, n_move )
shift_spect_array_cpu( grid.rho_prev, shift, n_move )
if shift_currents:
self.shift_spect_field( grid.Jp, shift, n_move )
self.shift_spect_field( grid.Jm, shift, n_move )
self.shift_spect_field( grid.Jz, shift, n_move )

def shift_spect_field( self, field_array, shift_factor, n_move ):
"""
Shift the field 'field_array' by n_move cells.
This is done in spectral space and corresponds to multiplying the
fields with the factor exp(i*kz_true*dz)**n_move .
(Typically n_move is positive, and the fields are shifted backwards)

Parameters
----------
field_array: 2darray of complexs
Contains the value of the fields, and is modified by
this function

shift_factor: 1darray of complexs
Contains the shift array, that is multiplied to the fields in
spectral space to shift them by one cell in spatial space
( exp(i*kz_true*dz) )

n_move: int
The number of cells by which the grid should be shifted
"""
# Multiply with (shift_factor*sign(n_move))**n_move
field_array *= ( shift_factor[:, np.newaxis] )**n_move

def shift_spect_field_gpu( self, field_array, shift_factor, n_move):
"""
Shift the field 'field_array' by n_move cells on the GPU.
This is done in spectral space and corresponds to multiplying the
fields with the factor exp(i*kz_true*dz)**n_move .
(Typically n_move is positive, and the fields are shifted backwards)

Parameters
----------
field_array: 2darray of complexs
Contains the value of the fields, and is modified by
this function

shift_factor: 1darray of complexs
Contains the shift array, that is multiplied to the fields in
spectral space to shift them by one cell in spatial space
( exp(i*kz_true*dz) )

n_move: int
The number of cells by which the grid should be shifted
"""
# Get a 2D CUDA grid of the size of the grid
dim_grid_2d, dim_block_2d = cuda_tpb_bpg_2d(
field_array.shape[0], field_array.shape[1] )
# Shift the field array in place
shift_spect_array_gpu[dim_grid_2d, dim_block_2d](
field_array, shift_factor, n_move)
shift_spect_array_cpu( grid.Jp, shift, n_move )
shift_spect_array_cpu( grid.Jm, shift, n_move )
shift_spect_array_cpu( grid.Jz, shift, n_move )

def shift_interp_grid( self, grid, n_move,
shift_rho=True, shift_currents=False ):
Expand Down Expand Up @@ -513,6 +464,39 @@ def shift_interp_field_gpu( self, field_array, n_move):
# Return the new shifted field array
return( field_array )

@njit_parallel
def shift_spect_array_cpu( field_array, shift_factor, n_move ):
"""
Shift the field 'field_array' by n_move cells on CPU.
This is done in spectral space and corresponds to multiplying the
fields with the factor exp(i*kz_true*dz)**n_move .

Parameters
----------
field_array: 2darray of complexs
Contains the value of the fields, and is modified by
this function

shift_factor: 1darray of complexs
Contains the shift array, that is multiplied to the fields in
spectral space to shift them by one cell in spatial space
( exp(i*kz_true*dz) )

n_move: int
The number of cells by which the grid should be shifted
"""
Nz, Nr = field_array.shape

# Loop over the 2D array (in parallel over z if threading is enabled)
for iz in prange( Nz ):
power_shift = shift_factor[iz]
# Calculate the shift factor (raising to the power n_move)
for i in range(1,n_move):
power_shift *= shift_factor[iz]
# Shift fields backwards
for ir in range( Nr ):
field_array[iz, ir] *= power_shift

if cuda_installed:

@cuda.jit('void(complex128[:,:], complex128[:,:], int32)')
Expand Down
39 changes: 19 additions & 20 deletions fbpic/fields/numba_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,18 @@
This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
It defines the optimized fields methods that use numba on a CPU
"""
import numba
from scipy.constants import c, epsilon_0, mu_0
c2 = c**2
from fbpic.threading_utils import njit_parallel, prange

@numba.jit('void(complex128[:,:], complex128[:,:], \
complex128[:,:], complex128[:,:], complex128[:,:], \
float64[:,:], float64[:,:], float64[:,:], \
float64, int32, int32)')
@njit_parallel
def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
kz, kr, inv_k2, inv_dt, Nz, Nr ):
"""
Correct the currents in spectral space, using the standard pstad
"""
# Loop over the 2D grid
for iz in range(Nz):
# Loop over the 2D grid (parallel in z, if threading is installed)
for iz in prange(Nz):
for ir in range(Nr):

# Calculate the intermediate variable F
Expand All @@ -33,13 +30,9 @@ def numba_correct_currents_standard( rho_prev, rho_next, Jp, Jm, Jz,
Jm[iz, ir] += -0.5 * kr[iz, ir] * F
Jz[iz, ir] += -1.j * kz[iz, ir] * F

@numba.jit('void(complex128[:,:], complex128[:,:], complex128[:,:], \
complex128[:,:], complex128[:,:], complex128[:,:], \
complex128[:,:], complex128[:,:], complex128[:,:], \
complex128[:,:], complex128[:,:], \
float64[:,:], float64[:,:], float64[:,:], \
float64[:,:], float64[:,:], float64[:,:], float64[:,:], float64, \
int8, int32, int32)')
return

@njit_parallel
def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
rho_prev, rho_next,
rho_prev_coef, rho_next_coef, j_coef,
Expand All @@ -50,8 +43,8 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,

See the documentation of SpectralGrid.push_eb_with
"""
# Loop over the 2D grid
for iz in range(Nz):
# Loop over the 2D grid (parallel in z, if threading is installed)
for iz in prange(Nz):
for ir in range(Nr):

# Save the electric fields, since it is needed for the B push
Expand Down Expand Up @@ -106,7 +99,9 @@ def numba_push_eb_standard( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
+ j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
+ 1.j*kr[iz, ir]*Jm[iz, ir] )

@numba.jit
return

@njit_parallel
def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
kz, kr, inv_k2,
j_corr_coef, T_eb, T_cc,
Expand All @@ -115,8 +110,8 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
Correct the currents in spectral space, using the assumption
of comoving currents
"""
# Loop over the 2D grid
for iz in range(Nz):
# Loop over the 2D grid (parallel in z, if threading is installed)
for iz in prange(Nz):
for ir in range(Nr):

# Calculate the intermediate variable F
Expand All @@ -130,7 +125,9 @@ def numba_correct_currents_comoving( rho_prev, rho_next, Jp, Jm, Jz,
Jm[iz, ir] += -0.5 * kr[iz, ir] * F
Jz[iz, ir] += -1.j * kz[iz, ir] * F

@numba.jit
return

@njit_parallel
def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
rho_prev, rho_next,
rho_prev_coef, rho_next_coef, j_coef,
Expand Down Expand Up @@ -207,3 +204,5 @@ def numba_push_eb_comoving( Ep, Em, Ez, Bp, Bm, Bz, Jp, Jm, Jz,
+ 1.j*kr[iz, ir]*Em_old ) \
+ j_coef[iz, ir]*( 1.j*kr[iz, ir]*Jp[iz, ir] \
+ 1.j*kr[iz, ir]*Jm[iz, ir] )

return
29 changes: 16 additions & 13 deletions fbpic/fields/spectral_transform/spectral_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .hankel import DHT
from .fourier import FFT

from .threading_methods import numba_rt_to_pm, numba_pm_to_rt
# Check if CUDA is available, then import CUDA functions
from fbpic.cuda_utils import cuda_installed
if cuda_installed:
Expand Down Expand Up @@ -139,12 +140,13 @@ def spect2interp_vect( self, spect_array_p, spect_array_m,
self.spect_buffer_r, self.spect_buffer_t )
else :
# Combine them on the CPU
# (It is important to write the affectation in the following way,
# since self.spect_buffer_p and self.spect_buffer_r actually point
# to the same object, for memory economy)
self.spect_buffer_r[:,:], self.spect_buffer_t[:,:] = \
( self.spect_buffer_p + self.spect_buffer_m), \
1.j*( self.spect_buffer_p - self.spect_buffer_m)
# (self.spect_buffer_r and self.spect_buffer_t are
# passed in the following line, in order to make things
# explicit, but they actually point to the same object
# as self.spect_buffer_p, self.spect_buffer_m,
# for economy of memory)
numba_pm_to_rt( self.spect_buffer_p, self.spect_buffer_m,
self.spect_buffer_r, self.spect_buffer_t )

# Finally perform the FFT (along axis 0, which corresponds to z)
self.fft.inverse_transform( self.spect_buffer_r, interp_array_r )
Expand Down Expand Up @@ -205,13 +207,14 @@ def interp2spect_vect( self, interp_array_r, interp_array_t,
self.spect_buffer_r, self.spect_buffer_t,
self.spect_buffer_p, self.spect_buffer_m )
else :
# Combine them on the CPU
# (It is important to write the affectation in the following way,
# since self.spect_buffer_p and self.spect_buffer_r actually point
# to the same object, for memory economy.)
self.spect_buffer_p[:,:], self.spect_buffer_m[:,:] = \
0.5*( self.spect_buffer_r - 1.j*self.spect_buffer_t ), \
0.5*( self.spect_buffer_r + 1.j*self.spect_buffer_t )
# Combine them on the GPU
# (self.spect_buffer_p and self.spect_buffer_m are
# passed in the following line, in order to make things
# explicit, but they actually point to the same object
# as self.spect_buffer_r, self.spect_buffer_t,
# for economy of memory)
numba_rt_to_pm( self.spect_buffer_r, self.spect_buffer_t,
self.spect_buffer_p, self.spect_buffer_m )

# Perform the inverse DHT (along axis -1, which corresponds to r)
self.dhtp.transform( self.spect_buffer_p, spect_array_p )
Expand Down
58 changes: 58 additions & 0 deletions fbpic/fields/spectral_transform/threading_methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Copyright 2016, FBPIC contributors
# Authors: Remi Lehe, Manuel Kirchen
# License: 3-Clause-BSD-LBNL
"""
This file is part of the Fourier-Bessel Particle-In-Cell code (FB-PIC)
It defines a set of functions that are useful when converting the
fields from interpolation grid to the spectral grid and vice-versa
"""
from fbpic.threading_utils import prange, njit_parallel

# ----------------------------------------------------
# Functions that combine components in spectral space
# ----------------------------------------------------

@njit_parallel
def numba_rt_to_pm( buffer_r, buffer_t, buffer_p, buffer_m ) :
"""
Combine the arrays buffer_r and buffer_t to produce the
arrays buffer_p and buffer_m, according to the rules of
the Fourier-Hankel decomposition (see associated paper)
"""
Nz, Nr = buffer_r.shape

# Loop over the 2D grid (parallel in z, if threading is installed)
for iz in prange(Nz):
for ir in range(Nr):

# Use intermediate variables, as the arrays
# buffer_r and buffer_t may actually point to the same
# object as buffer_p and buffer_m, for economy of memory
value_r = buffer_r[iz, ir]
value_t = buffer_t[iz, ir]
# Combine the values
buffer_p[iz, ir] = 0.5*( value_r - 1.j*value_t )
buffer_m[iz, ir] = 0.5*( value_r + 1.j*value_t )


@njit_parallel
def numba_pm_to_rt( buffer_p, buffer_m, buffer_r, buffer_t ) :
"""
Combine the arrays buffer_p and buffer_m to produce the
arrays buffer_r and buffer_t, according to the rules of
the Fourier-Hankel decomposition (see associated paper)
"""
Nz, Nr = buffer_p.shape

# Loop over the 2D grid (parallel in z, if threading is installed)
for iz in prange(Nz):
for ir in range(Nr):

# Use intermediate variables, as the arrays
# buffer_r and buffer_t may actually point to the same
# object as buffer_p and buffer_m, for economy of memory
value_p = buffer_p[iz, ir]
value_m = buffer_m[iz, ir]
# Combine the values
buffer_r[iz, ir] = ( value_p + value_m )
buffer_t[iz, ir] = 1.j*( value_p - value_m )
4 changes: 2 additions & 2 deletions fbpic/lpa_utils/laser/antenna.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
from scipy.constants import e, c, epsilon_0, physical_constants
r_e = physical_constants['classical electron radius'][0]
from .profiles import gaussian_profile
from fbpic.particles.utility_methods import weights
from fbpic.particles.numba_methods import deposit_field_numba
from fbpic.particles.utilities.utility_methods import weights
from fbpic.particles.deposition.numba_methods import deposit_field_numba

# Check if CUDA is available, then import CUDA functions
from fbpic.cuda_utils import cuda_installed
Expand Down
Loading