GPU acclelerated archetypes.get_best_archetype().

Use batch rebin, transmission_Lyman, and calc_zchi2 operations. Speed gains: without archetypes, redrock on 4 GPU / 4 CPU takes 14.8s reported total run time, 7.3s of which is in the fine redshift scan. Comparatively with 64 CPU and 0 GPU the base redrock runs in 40.0s reported total run time with 6.7s spent in fine redshift scan. Adding the base archetypes option (without per-camera or nearest neighbor) raises CPU times to 63.8s overall and 28.1s in fine z scan so about 60% increase overall and about 4x slower in fine z scan. With the new code the "batch" CPU mode slightly improves this to 60.0s and 24.2s. With the new GPU code, it runs on 4 GPU / 4 CPU in 22.8s total and 14.3s for fine z scan, a 50% overall increase but only 2x speed increase in the fine z scan, a big improvement from CPU times. Also updated transmission_Lyman to return None when given scalar z to match behavior when given an array of redshifts, in the case where the wavelength range is not affected by Lyman transmission. There is no need in this case to calculate an array of all ones and then additionally multiply the rebinned data by it. Have yet to update --per-camera and -n_nearest options so right now there are placeholders so that it does not crash that simply loop over the existing CPU-mode logic. These will be updated shortly.
desihub · Sep 19, 2023 · 055aa12 · 055aa12
1 parent d0395f5
commit 055aa12
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 26 deletions.
diff --git a/py/redrock/archetypes.py b/py/redrock/archetypes.py
@@ -11,14 +11,13 @@
 from scipy.interpolate import interp1d
 import scipy.special
 
-from .zscan import calc_zchi2_one
+from .zscan import calc_zchi2_one, calc_zchi2_batch
 
 from .rebin import trapz_rebin
 
 from .utils import transmission_Lyman
 
-from .zscan import per_camera_coeff_with_least_square
-
+from .zscan import per_camera_coeff_with_least_square, per_camera_coeff_with_least_square_batch
 
 class Archetype():
     """Class to store all different archetypes from the same spectype.
@@ -47,6 +46,8 @@ def __init__(self, filename):
         self.wave = np.asarray(hdr['CRVAL1'] + hdr['CDELT1']*np.arange(self.flux.shape[1]))
         if hdr['LOGLAM']:
             self.wave = 10**self.wave
+        self.minwave = self.wave[0]
+        self.maxwave = self.wave[-1]
 
         self._archetype = {}
         self._archetype['INTERP'] = np.array([None]*self._narch)
@@ -64,6 +65,40 @@ def rebin_template(self,index,z,dwave,trapz=True):
         else:
             return {hs:self._archetype['INTERP'][index](wave/(1.+z)) for hs, wave in dwave.items()}
 
+    def rebin_template_batch(self,z,dwave,trapz=True,dedges=None,use_gpu=False):
+        """
+        """
+        if (use_gpu):
+            import cupy as cp
+            xmin = self.minwave
+            xmax = self.maxwave
+            self.flux = cp.asarray(self.flux)
+            self.wave = cp.asarray(self.wave)
+
+        minedge = None
+        maxedge = None
+        result = dict()
+        if (trapz and use_gpu and dedges is not None):
+            #Use GPU mode with bin edges already calculated
+            for hs, edges in dedges.items():
+                #Check if edges is a 1-d array or a tuple also containing scalar min/max values
+                if type(edges) is tuple:
+                    (edges, minedge, maxedge) = edges
+                result[hs] = trapz_rebin(self.wave, self.flux, edges=edges, use_gpu=use_gpu, myz=cp.array([z]), xmin=xmin, xmax=xmax, edge_min=minedge, edge_max=maxedge)[0,:,:]
+            return result
+        if trapz:
+            #Use batch mode of trapz_rebin
+            return {hs:trapz_rebin((1.+z)*self.wave, self.flux, wave, use_gpu=use_gpu) for hs, wave in dwave.items()}
+        else:
+            for hs, wave in dwave.items():
+                result[hs] = np.empty((len(wave), self._narch))
+                for i in range(self._narch):
+                    result[hs][:,i] = self._archetype['INTERP'][i](wave/(1.+z))
+                if (use_gpu):
+                    result[hs] = cp.asarray(result[hs])
+            #return {hs:self._archetype['INTERP'](wave/(1.+z)) for hs, wave in dwave.items()}
+        return result
+
     def eval(self, subtype, dwave, coeff, wave, z):
         """
 
@@ -112,12 +147,19 @@ def nearest_neighbour_model(self, spectra,weights,flux,wflux,dwave,z, legendre,
         nleg = legendre[list(legendre.keys())[0]].shape[0]
         iBest = np.argsort(zzchi2)[0:n_nearest]
         binned_best = self.rebin_template(iBest[0], z, dwave,trapz=True)
-        binned_best = { hs:trans[hs]*binned_best[hs] for hs, w in dwave.items() }
+        for hs, w in dwave.items():
+            if (trans[hs] is not None):
+                binned_best[hs] *= trans[hs]
+        #binned_best = { hs:trans[hs]*binned_best[hs] for hs, w in dwave.items() }
         tdata = {hs: binned_best[hs][:,None] for hs, wave in dwave.items()}
         if len(iBest)>1:
             for ib in iBest[1:]:
+                #print ("IB", ib)
                 binned = self.rebin_template(ib, z, dwave,trapz=True)
-                binned = { hs:trans[hs]*binned[hs] for hs, w in dwave.items() }
+                for hs, w in dwave.items():
+                    if (trans[hs] is not None):
+                        binned[hs] *= trans[hs]
+                #binned = { hs:trans[hs]*binned[hs] for hs, w in dwave.items() }
                 tdata = { hs:np.append(tdata[hs], binned[hs][:,None], axis=1) for hs, wave in dwave.items()}
             if nleg>0:
                 tdata = { hs:np.append(tdata[hs],legendre[hs].transpose(), axis=1 ) for hs, wave in dwave.items() }
@@ -134,7 +176,7 @@ def nearest_neighbour_model(self, spectra,weights,flux,wflux,dwave,z, legendre,
             return zzchi2, zzcoeff, self._rrtype+':::%s'%(fsstype)
 
 
-    def get_best_archetype(self,spectra,weights,flux,wflux,dwave,z,legendre, per_camera, n_nearest):
+    def get_best_archetype(self,spectra,weights,flux,wflux,dwave,z,legendre, per_camera, n_nearest, dedges=None, trans=None, use_gpu=False):
 
         """Get the best archetype for the given redshift and spectype.
 
@@ -148,6 +190,9 @@ def get_best_archetype(self,spectra,weights,flux,wflux,dwave,z,legendre, per_cam
             legendre (dict): legendre polynomial
             per_camera (bool): True if fitting needs to be done in each camera
             n_nearest (int): number of nearest neighbours to be used in chi2 space (including best archetype)
+            dedges (dict): in GPU mode, use pre-computed dict of wavelength bin edges, already on GPU
+            trans (dict): pass previously calcualated Lyman transmission instead of recalculating
+            use_gpu (bool): use GPU or not
             
         Returns:
             chi2 (float): chi2 of best archetype
@@ -172,22 +217,50 @@ def get_best_archetype(self,spectra,weights,flux,wflux,dwave,z,legendre, per_cam
         #TODO: return best fit model as well
         #zzmodel = np.zeros((self._narch, obs_wave.size), dtype=np.float64)
 
-        trans = { hs:transmission_Lyman(z,w) for hs, w in dwave.items() }
-
-        for i in range(self._narch):
-            binned = self.rebin_template(i, z, dwave,trapz=True)
-            binned = { hs:trans[hs]*binned[hs] for hs, w in dwave.items() }
-            if nleg>0:
-                tdata = { hs:np.append(binned[hs][:,None],legendre[hs].transpose(), axis=1 ) for hs, wave in dwave.items() }
-            else:
-                tdata = {hs:binned[hs][:,None] for hs, wave in dwave.items()}
-            if per_camera:
-                zzchi2[i], zzcoeff[i]= per_camera_coeff_with_least_square(spectra, tdata, nleg, method='bvls', n_nbh=1)
+        if (trans is None):
+            #Calculate Lyman transmission if not passed as dict
+            trans = { hs:transmission_Lyman(z,w, use_gpu=False) for hs, w in dwave.items() }
+        else:
+            #Use previously calculated Lyman transmission
+            for hs in trans:
+                if (trans[hs] is not None):
+                    trans[hs] = trans[hs][0,:]
+
+        #Select np or cp for operations as arrtype
+        if (use_gpu):
+            import cupy as cp
+            arrtype = cp
+        else:
+            arrtype = np
+        #Rebin in batch
+        binned = self.rebin_template_batch(z, dwave, trapz=True, dedges=dedges, use_gpu=use_gpu)
+
+        tdata = dict()
+        nbasis = 1
+        for hs, wave in dwave.items():
+            if (trans[hs] is not None):
+                #Only multiply if trans[hs] is not None
+                binned[hs] *= arrtype.asarray(trans[hs][:,None])
+            #Create 3-d tdata with narch x nwave x nbasis where nbasis = 1+nleg
+            if nleg > 0:
+                tdata[hs] = arrtype.append(binned[hs].transpose()[:,:,None], arrtype.tile(arrtype.asarray(legendre[hs]).transpose()[None,:,:], (self._narch, 1, 1)), axis=2)
             else:
-                zzchi2[i], zzcoeff[i] = calc_zchi2_one(spectra, weights, flux, wflux, tdata)
-
+                tdata[hs] = binned[hs].transpose()[:,:,None]
+            nbasis = tdata[hs].shape[2]
+        if per_camera:
+            #Batch placeholder that right now loops over each arch but will be GPU accelerated
+            (zzchi2, zzcoeff) = per_camera_coeff_with_least_square_batch(spectra, tdata, nleg, self._narch, method='bvls', n_nbh=1, use_gpu=use_gpu)
+        else:
+            (zzchi2, zzcoeff) = calc_zchi2_batch(spectra, tdata, weights, flux, wflux, self._narch, nbasis, use_gpu=use_gpu)
+
         if n_nearest is not None:
-            best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(spectra,weights,flux,wflux,dwave,z, legendre, n_nearest, zzchi2, trans, per_camera) 
+            if (use_gpu):
+                ##PLACEHOLDER - copy flux and wave back to CPU so that nearest neighbor will work until that algorithm is GPU-ized
+                self.flux = self.flux.get()
+                self.wave = self.wave.get()
+                best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(spectra,weights.get(),flux.get(),wflux.get(),dwave,z, legendre, n_nearest, zzchi2, trans, per_camera)
+            else:
+                best_chi2, best_coeff, best_fulltype = self.nearest_neighbour_model(spectra,weights,flux,wflux,dwave,z, legendre, n_nearest, zzchi2, trans, per_camera)
             return best_chi2, best_coeff, best_fulltype
         else:
             iBest = np.argmin(zzchi2)

diff --git a/py/redrock/fitz.py b/py/redrock/fitz.py
@@ -109,7 +109,6 @@ def minfit(x, y):
 
 
 def legendre_calculate(nleg, dwave):
-
     wave = np.concatenate([ w for w in dwave.values() ])
     wmin = wave.min()
     wmax = wave.max()
@@ -151,10 +150,6 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
     # Build dictionary of wavelength grids
     dwave = { s.wavehash:s.wave for s in spectra }
 
-    if not archetype is None:
-        legendre = legendre_calculate(deg_legendre, dwave=dwave)
-
-
     (weights, flux, wflux) = spectral_data(spectra)
 
     if (use_gpu):
@@ -166,6 +161,9 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
         gpuedges = { s.wavehash:(s.gpuedges, s.minedge, s.maxedge) for s in spectra }
         gpudwave = { s.wavehash:s.gpuwave for s in spectra }
 
+    if not archetype is None:
+        legendre = legendre_calculate(deg_legendre, dwave=dwave)
+
     results = list()
     #Define nz here instead of hard-coding length 15 and then defining nz as
     #length of zz list
@@ -226,6 +224,7 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
         i = min(max(np.argmin(zzchi2),1), len(zz)-2)
         zmin, sigma, chi2min, zwarn = minfit(zz[i-1:i+2], zzchi2[i-1:i+2])
 
+        trans = dict()
         try:
             #Calculate xmin and xmax from template and pass as scalars 
             xmin = template.minwave*(1+zmin)
@@ -242,6 +241,7 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
                     binned[k] = binned[k].get()
                 #Use CPU always
                 T = transmission_Lyman(np.array([zmin]),dwave[k], use_gpu=False)
+                trans[k] = T
                 if (T is None):
                     #Return value of None means that wavelenght regime
                     #does not overlap Lyman transmission - continue here
@@ -289,7 +289,11 @@ def fitz(zchi2, redshifts, target, template, nminima=3, archetype=None, use_gpu=
                 chi2=chi2min, zz=zz, zzchi2=zzchi2,
                 coeff=coeff))
         else:
-            chi2min, coeff, fulltype = archetype.get_best_archetype(spectra,weights,flux,wflux,dwave,zbest,legendre, per_camera, n_nearest)
+            if (use_gpu):
+                chi2min, coeff, fulltype = archetype.get_best_archetype(spectra,gpuweights,gpuflux,gpuwflux,dwave,zbest,legendre, per_camera, n_nearest, dedges=gpuedges, trans=trans, use_gpu=use_gpu)
+                del trans
+            else:
+                chi2min, coeff, fulltype = archetype.get_best_archetype(spectra,weights,flux,wflux,dwave,zbest,legendre, per_camera, n_nearest, use_gpu=use_gpu)
 
             results.append(dict(z=zbest, zerr=zerr, zwarn=zwarn,
                 chi2=chi2min, zz=zz, zzchi2=zzchi2,

diff --git a/py/redrock/utils.py b/py/redrock/utils.py
@@ -255,6 +255,11 @@ def transmission_Lyman(zObj,lObs, use_gpu=False):
     min_wave = 0
     if (np.isscalar(zObj)):
         #zObj is a float
+        min_wave = lObs.min()/(1+zObj)
+        if (min_wave > Lyman_series['Lya']['line']):
+            #Return None if wavelength range doesn't overlap with Lyman series
+            #No need to perform any calculations in this case
+            return None
         lRF = lObs/(1.+zObj)
     else:
         if (len(zObj) == 0):

diff --git a/py/redrock/zscan.py b/py/redrock/zscan.py
@@ -347,6 +347,21 @@ def per_camera_coeff_with_least_square(spectra, tdata, nleg, method=None, n_nbh=
     #print(f'{time.time()-start} [sec] took for per camera BVLS method\n')
     return zchi2, coeff
 
+def per_camera_coeff_with_least_square_batch(spectra, tdata, nleg, narch, method=None, n_nbh=None, use_gpu=False):
+    ### PLACEHOLDER for algorithm that will be GPU accelerated
+    ncam = 3 # number of cameras in DESI: b, r, z
+    zzchi2 = np.zeros(narch, dtype=np.float64)
+    zzcoeff = np.zeros((narch,  1+ncam*(nleg)), dtype=np.float64)
+
+    tdata_one = dict()
+    for i in range(narch):
+        for hs in tdata:
+            tdata_one[hs] = tdata[hs][i,:,:]
+            if (use_gpu):
+                tdata_one[hs] = tdata_one[hs].get()
+        zzchi2[i], zzcoeff[i]= per_camera_coeff_with_least_square(spectra, tdata_one, nleg, method='bvls', n_nbh=1)
+    return zzchi2, zzcoeff
+
 def batch_dot_product_sparse(spectra, tdata, nz, use_gpu):
     """Calculate a batch dot product of the 3 sparse matrices in spectra
     with every template in tdata.  Sparse matrix libraries are used