Skip to content

Commit

Permalink
GPU acclelerated archetypes.get_best_archetype().
Browse files Browse the repository at this point in the history
Use batch rebin, transmission_Lyman, and calc_zchi2 operations.

Speed gains: without archetypes, redrock on 4 GPU / 4 CPU takes 14.8s
reported total run time, 7.3s of which is in the fine redshift scan.
Comparatively with 64 CPU and 0 GPU the base redrock runs in 40.0s
reported total run time with 6.7s spent in fine redshift scan.

Adding the base archetypes option (without per-camera or nearest
neighbor) raises CPU times to 63.8s overall and 28.1s in fine z scan so
about 60% increase overall and about 4x slower in fine z scan.

With the new code the "batch" CPU mode slightly improves this to 60.0s
and 24.2s.

With the new GPU code, it runs on 4 GPU / 4 CPU in 22.8s total and 14.3s
for fine z scan, a 50% overall increase but only 2x speed increase in
the fine z scan, a big improvement from CPU times.

Also updated transmission_Lyman to return None when given scalar z to
match behavior when given an array of redshifts, in the case where the
wavelength range is not affected by Lyman transmission.  There is no
need in this case to calculate an array of all ones and then
additionally multiply the rebinned data by it.

Have yet to update --per-camera and -n_nearest options so right now
there are placeholders so that it does not crash that simply loop over
the existing CPU-mode logic.  These will be updated shortly.
  • Loading branch information
craigwarner-ufastro committed Sep 19, 2023
1 parent d0395f5 commit 055aa12
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 26 deletions.
113 changes: 93 additions & 20 deletions py/redrock/archetypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,13 @@
from scipy.interpolate import interp1d
import scipy.special

from .zscan import calc_zchi2_one
from .zscan import calc_zchi2_one, calc_zchi2_batch

from .rebin import trapz_rebin

from .utils import transmission_Lyman

from .zscan import per_camera_coeff_with_least_square

from .zscan import per_camera_coeff_with_least_square, per_camera_coeff_with_least_square_batch

class Archetype():
"""Class to store all different archetypes from the same spectype.
Expand Down Expand Up @@ -47,6 +46,8 @@ def __init__(self, filename):
self.wave = np.asarray(hdr['CRVAL1'] + hdr['CDELT1']*np.arange(self.flux.shape[1]))
if hdr['LOGLAM']:
self.wave = 10**self.wave
self.minwave = self.wave[0]
self.maxwave = self.wave[-1]

self._archetype = {}
self._archetype['INTERP'] = np.array([None]*self._narch)
Expand All @@ -64,6 +65,40 @@ def rebin_template(self,index,z,dwave,trapz=True):
else:
return {hs:self._archetype['INTERP'][index](wave/(1.+z)) for hs, wave in dwave.items()}

def rebin_template_batch(self,z,dwave,trapz=True,dedges=None,use_gpu=False):
"""
"""
if (use_gpu):
import cupy as cp
xmin = self.minwave
xmax = self.maxwave
self.flux = cp.asarray(self.flux)
self.wave = cp.asarray(self.wave)

minedge = None
maxedge = None
result = dict()
if (trapz and use_gpu and dedges is not None):
#Use GPU mode with bin edges already calculated
for hs, edges in dedges.items():
#Check if edges is a 1-d array or a tuple also containing scalar min/max values
if type(edges) is tuple:
(edges, minedge, maxedge) = edges
result[hs] = trapz_rebin(self.wave, self.flux, edges=edges, use_gpu=use_gpu, myz=cp.array([z]), xmin=xmin, xmax=xmax, edge_min=minedge, edge_max=maxedge)[0,:,:]
return result
if trapz:
#Use batch mode of trapz_rebin
return {hs:trapz_rebin((1.+z)*self.wave, self.flux, wave, use_gpu=use_gpu) for hs, wave in dwave.items()}
else:
for hs, wave in dwave.items():
result[hs] = np.empty((len(wave), self._narch))
for i in range(self._narch):
result[hs][:,i] = self._archetype['INTERP'][i](wave/(1.+z))
if (use_gpu):
result[hs] = cp.asarray(result[hs])
#return {hs:self._archetype['INTERP'](wave/(1.+z)) for hs, wave in dwave.items()}
return result

def eval(self, subtype, dwave, coeff, wave, z):
"""
Expand Down Expand Up @@ -112,12 +147,19 @@ def nearest_neighbour_model(self, spectra,weights,flux,wflux,dwave,z, legendre,
nleg = legendre[list(legendre.keys())[0]].shape[0]
iBest = np.argsort(zzchi2)[0:n_nearest]
binned_best = self.rebin_template(iBest[0], z, dwave,trapz=True)
binned_best = { hs:trans[hs]*binned_best[hs] for hs, w in dwave.items() }
for hs, w in dwave.items():
if (trans[hs] is not None):
binned_best[hs] *= trans[hs]
#binned_best = { hs:trans[hs]*binned_best[hs] for hs, w in dwave.items() }
tdata = {hs: binned_best[hs][:,None] for hs, wave in dwave.items()}
if len(iBest)>1:
for ib in iBest[1:]:
#print ("IB", ib)
binned = self.rebin_template(ib, z, dwave,trapz=True)
binned = { hs:trans[hs]*binned[hs] for hs, w in dwave.items() }
for hs, w in dwave.items():
if (trans[hs] is not None):
binned[hs] *= trans[hs]
#binned = { hs:trans[hs]*binned[hs] for hs, w in dwave.items() }
tdata = { hs:np.append(tdata[hs], binned[hs][:,None], axis=1) for hs, wave in dwave.items()}
if nleg>0:
tdata = { hs:np.append(tdata[hs],legendre[hs].transpose(), axis=1 ) for hs, wave in dwave.items() }
Expand All @@ -134,7 +176,7 @@ def nearest_neighbour_model(self, spectra,weights,flux,wflux,dwave,z, legendre,
return zzchi2, zzcoeff, self._rrtype+':::%s'%(fsstype)


def get_best_archetype(self,spectra,weights,flux,wflux,dwave,z,legendre, per_camera, n_nearest):
def get_best_archetype(self,spectra,weights,flux,wflux,dwave,z,legendre, per_camera, n_nearest, dedges=None, trans=None, use_gpu=False):

"""Get the best archetype for the given redshift and spectype.
Expand All @@ -148,6 +190,9 @@ def get_best_archetype(self,spectra,weights,flux,wflux,dwave,z,legendre, per_cam
legendre (dict): legendre polynomial
per_camera (bool): True if fitting needs to be done in each camera
n_nearest (int): number of nearest neighbours to be used in chi2 space (including best archetype)
dedges (dict): in GPU mode, use pre-computed dict of wavelength bin edges, already on GPU
trans (dict): pass previously calcualated Lyman transmission instead of recalculating
use_gpu (bool): use GPU or not
Returns:
chi2 (float): chi2 of best archetype
Expand All @@ -172,22 +217,50 @@ def get_best_archetype(self,spectra,weights,flux,wflux,dwave,z,legendre, per_cam
#TODO: return best fit model as well
#zzmodel = np.zeros((self._narch, obs_wave.size), dtype=np.float64)

trans = { hs:transmission_Lyman(z,w) for hs, w in dwave.items() }

for i in range(self._narch):
binned = self.rebin_template(i, z, dwave,trapz=True)
binned = { hs:trans[hs]*binned[hs] for hs, w in dwave.items() }
if nleg>0:
tdata = { hs:np.append(binned[hs][:,None],legendre[hs].transpose(), axis=1 ) for hs, wave in dwave.items() }
else:
tdata = {hs:binned[hs][:,None] for hs, wave in dwave.items()}
if per_camera:
zzchi2[i], zzcoeff[i]= per_camera_coeff_with_least_square(spectra, tdata, nleg, method='bvls', n_nbh=1)
if (trans is None):
#Calculate Lyman transmission if not passed as dict
trans = { hs:transmission_Lyman(z,w, use_gpu=False) for hs, w in dwave.items() }
else:
#Use previously calculated Lyman transmission
for hs in trans:
if (trans[hs] is not None):
trans[hs] = trans[hs][0,:]

#Select np or cp for operations as arrtype
if (use_gpu):
import cupy as cp
arrtype = cp
else:
arrtype = np
#Rebin in batch
binned = self.rebin_template_batch(z, dwave, trapz=True, dedges=dedges, use_gpu=use_gpu)

tdata = dict()
nbasis = 1
for hs, wave in dwave.items():
if (trans[hs] is not None):
#Only multiply if trans[hs] is not None
binned[hs] *= arrtype.asarray(trans[hs][:,None])
#Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg
if nleg > 0:
tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (self._narch, 1, 1)), axis=2)
else:
zzchi2[i], zzcoeff[i] = calc_zchi2_one(spectra, weights, flux, wflux, tdata)

tdata[hs] = binned[hs].transpose()[:,:,None]
nbasis = tdata[hs].shape[2]
if per_camera:
#Batch placeholder that right now loops over each arch but will be GPU accelerated
(zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(spectra, tdata, nleg, self._narch, method='bvls', n_nbh=1, use_gpu=use_gpu)
else:
(zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, weights, flux, wflux, self._narch, nbasis, use_gpu=use_gpu)

if n_nearest is not None:
best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(spectra,weights,flux,wflux,dwave,z, legendre, n_nearest, zzchi2, trans, per_camera)
if (use_gpu):
##PLACEHOLDER - copy flux and wave back to CPU so that nearest neighbor will work until that algorithm is GPU-ized
self.flux = self.flux.get()
self.wave = self.wave.get()
best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(spectra,weights.get(),flux.get(),wflux.get(),dwave,z, legendre, n_nearest, zzchi2, trans, per_camera)
else:
best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(spectra,weights,flux,wflux,dwave,z, legendre, n_nearest, zzchi2, trans, per_camera)
return best_chi2, best_coeff, best_fulltype
else:
iBest = np.argmin(zzchi2)
Expand Down
16 changes: 10 additions & 6 deletions py/redrock/fitz.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ def minfit(x, y):


def legendre_calculate(nleg, dwave):

wave = np.concatenate([ w for w in dwave.values() ])
wmin = wave.min()
wmax = wave.max()
Expand Down Expand Up @@ -151,10 +150,6 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
# Build dictionary of wavelength grids
dwave = { s.wavehash:s.wave for s in spectra }

if not archetype is None:
legendre = legendre_calculate(deg_legendre, dwave=dwave)


(weights, flux, wflux) = spectral_data(spectra)

if (use_gpu):
Expand All @@ -166,6 +161,9 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
gpuedges = { s.wavehash:(s.gpuedges, s.minedge, s.maxedge) for s in spectra }
gpudwave = { s.wavehash:s.gpuwave for s in spectra }

if not archetype is None:
legendre = legendre_calculate(deg_legendre, dwave=dwave)

results = list()
#Define nz here instead of hard-coding length 15 and then defining nz as
#length of zz list
Expand Down Expand Up @@ -226,6 +224,7 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
i = min(max(np.argmin(zzchi2),1), len(zz)-2)
zmin, sigma, chi2min, zwarn = minfit(zz[i-1:i+2], zzchi2[i-1:i+2])

trans = dict()
try:
#Calculate xmin and xmax from template and pass as scalars
xmin = template.minwave*(1+zmin)
Expand All @@ -242,6 +241,7 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
binned[k] = binned[k].get()
#Use CPU always
T = transmission_Lyman(np.array([zmin]),dwave[k], use_gpu=False)
trans[k] = T
if (T is None):
#Return value of None means that wavelenght regime
#does not overlap Lyman transmission - continue here
Expand Down Expand Up @@ -289,7 +289,11 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
chi2=chi2min, zz=zz, zzchi2=zzchi2,
coeff=coeff))
else:
chi2min, coeff, fulltype = archetype.get_best_archetype(spectra,weights,flux,wflux,dwave,zbest,legendre, per_camera, n_nearest)
if (use_gpu):
chi2min, coeff, fulltype = archetype.get_best_archetype(spectra,gpuweights,gpuflux,gpuwflux,dwave,zbest,legendre, per_camera, n_nearest, dedges=gpuedges, trans=trans, use_gpu=use_gpu)
del trans
else:
chi2min, coeff, fulltype = archetype.get_best_archetype(spectra,weights,flux,wflux,dwave,zbest,legendre, per_camera, n_nearest, use_gpu=use_gpu)

results.append(dict(z=zbest, zerr=zerr, zwarn=zwarn,
chi2=chi2min, zz=zz, zzchi2=zzchi2,
Expand Down
5 changes: 5 additions & 0 deletions py/redrock/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,11 @@ def transmission_Lyman(zObj,lObs, use_gpu=False):
min_wave = 0
if (np.isscalar(zObj)):
#zObj is a float
min_wave = lObs.min()/(1+zObj)
if (min_wave > Lyman_series['Lya']['line']):
#Return None if wavelength range doesn't overlap with Lyman series
#No need to perform any calculations in this case
return None
lRF = lObs/(1.+zObj)
else:
if (len(zObj) == 0):
Expand Down
15 changes: 15 additions & 0 deletions py/redrock/zscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,21 @@ def per_camera_coeff_with_least_square(spectra, tdata, nleg, method=None, n_nbh=
#print(f'{time.time()-start} [sec] took for per camera BVLS method\n')
return zchi2, coeff

def per_camera_coeff_with_least_square_batch(spectra, tdata, nleg, narch, method=None, n_nbh=None, use_gpu=False):
### PLACEHOLDER for algorithm that will be GPU accelerated
ncam = 3 # number of cameras in DESI: b, r, z
zzchi2 = np.zeros(narch, dtype=np.float64)
zzcoeff = np.zeros((narch, 1+ncam*(nleg)), dtype=np.float64)

tdata_one = dict()
for i in range(narch):
for hs in tdata:
tdata_one[hs] = tdata[hs][i,:,:]
if (use_gpu):
tdata_one[hs] = tdata_one[hs].get()
zzchi2[i], zzcoeff[i]= per_camera_coeff_with_least_square(spectra, tdata_one, nleg, method='bvls', n_nbh=1)
return zzchi2, zzcoeff

def batch_dot_product_sparse(spectra, tdata, nz, use_gpu):
"""Calculate a batch dot product of the 3 sparse matrices in spectra
with every template in tdata. Sparse matrix libraries are used
Expand Down

0 comments on commit 055aa12

Please sign in to comment.