Skip to content

Commit

Permalink
Redshifting during training to reduce the array size that must be built beforehand.…
Browse files Browse the repository at this point in the history
… Redshifting using log wavelength scale, and apodizing after this.

Added commands `astrodash` and `dash` to setup.py so that running this will open the GUI.
Saving old memory mapping method (in case arrays get too large again) as create_arrays_with_memory_mapping.py.
No longer doing memory mapping in create_arrays.py because some of the augmentation steps that were causing large arrays have been moved to occur during training rather than before.
  • Loading branch information
daniel-muthukrishna committed Jul 6, 2018
1 parent 7129383 commit d273d75
Show file tree
Hide file tree
Showing 8 changed files with 523 additions and 40 deletions.
2 changes: 1 addition & 1 deletion dash/classify_OzDES_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def main(runDirectory, atelTextFile, saveMatchesFilename):
print("{0:17} | {1:5} | {2:8} | {3:10} | {4:6} | {5:10} | {6:10}".format('_'.join([filenames[i].split('/')[-1].split('_')[0], filenames[i].split('/')[-1].split('_')[3]]) , redshifts[i], bestTypes[i][0], bestTypes[i][1], bestTypes[i][2], matchesFlag[i].replace(' matches',''), wikiClassifications[i]))

# Plot one of the matches
classification.plot_with_gui(indexToPlot=7)
classification.plot_with_gui(indexToPlot=0)


if __name__ == '__main__':
Expand Down
11 changes: 8 additions & 3 deletions dash/create_and_save_all_data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@
classifyHost = False
minZ = 0.
maxZ = 0.8
numOfRedshifts = 50
trainFraction = 1.0
redshiftDuringTraining = True
trainFraction = 1
numTrainBatches = 2000000
# Do not change this unless we want to redshift before training.
numOfRedshifts = 1

if numOfRedshifts != 1:
redshiftDuringTraining = False

dataDirName = os.path.join(scriptDirectory, 'data_files_{0}/'.format(modelName))
dataFilenames = []
Expand Down Expand Up @@ -58,7 +63,7 @@
print("time spent: {0:.2f}".format(t3 - t2))

# TRAIN TENSORFLOW MODEL
modelFilenames = train_model(dataDirName, overwrite=True, numTrainBatches=numTrainBatches)
modelFilenames = train_model(dataDirName, overwrite=True, numTrainBatches=numTrainBatches, minZ=minZ, maxZ=maxZ, redshifting=redshiftDuringTraining)
dataFilenames.extend(modelFilenames)
t4 = time.time()
print("time spent: {0:.2f}".format(t4 - t3))
Expand Down
50 changes: 18 additions & 32 deletions dash/create_arrays.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import os
import glob
import numpy as np
from random import shuffle
import multiprocessing as mp
Expand Down Expand Up @@ -294,21 +292,19 @@ def __init__(self, w0, w1, nw, nTypes, minAge, maxAge, ageBinSize, typeList, min
self.createLabels = CreateLabels(self.nTypes, self.minAge, self.maxAge, self.ageBinSize, self.typeList, hostTypes, nHostTypes)
self.hostTypes = hostTypes

# TODO: Maybe do memory mapping for these arrays
self.images = []
self.labelsIndexes = []
self.filenames = []
self.typeNames = []

def combined_sn_gal_templates_to_arrays(self, args):
snTemplateLocation, snTempList, galTemplateLocation, galTempList, snFractions = args

randnum = np.random.randint(10000)
arraySize = len(galTempList) * len(snTempList) * 50 * len(snFractions) * self.numOfRedshifts
images = np.memmap('images_{}_{}.dat'.format(snTempList[0], randnum), dtype=np.float16, mode='w+', shape=(arraySize, int(self.nw)))
labelsIndexes = np.memmap('labels_{}_{}.dat'.format(snTempList[0], randnum), dtype=np.uint16, mode='w+', shape=arraySize)
filenames = np.memmap('filenames_{}_{}.dat'.format(snTempList[0], randnum), dtype=object, mode='w+', shape=arraySize)
typeNames = np.memmap('typeNames_{}_{}.dat'.format(snTempList[0], randnum), dtype=object, mode='w+', shape=arraySize)
nRows = 0
images = np.empty((0, int(self.nw)), np.float16) # Number of pixels
labelsIndexes = []
filenames = []
typeNames = []
agesList = []

for j in range(len(galTempList)):
galFilename = galTemplateLocation + galTempList[j] if galTemplateLocation is not None else None
Expand All @@ -326,6 +322,7 @@ def combined_sn_gal_templates_to_arrays(self, args):
redshifts = np.random.uniform(low=self.minZ, high=self.maxZ, size=self.numOfRedshifts)
for z in redshifts:
tempWave, tempFlux, nCols, ages, tType, tMinIndex, tMaxIndex = readSpectra.sn_plus_gal_template(ageidx, snCoeff, galCoeff, z)
agesList.append(ages[ageidx])
if tMinIndex == tMaxIndex or not tempFlux.any():
print("NO DATA for {} {} ageIdx:{} z>={}".format(galTempList[j], snTempList[i], ageidx, z))
break
Expand All @@ -340,22 +337,21 @@ def combined_sn_gal_templates_to_arrays(self, args):
nonzeroflux = tempFlux[tMinIndex:tMaxIndex + 1]
newflux = (nonzeroflux - min(nonzeroflux)) / (max(nonzeroflux) - min(nonzeroflux))
newflux2 = np.concatenate((tempFlux[0:tMinIndex], newflux, tempFlux[tMaxIndex + 1:]))
images[nRows] = np.array([newflux2])
labelsIndexes[nRows] = labelIndex
filenames[nRows] = "{0}_{1}_{2}_{3}_snCoeff{4}_z{5}".format(snTempList[i], tType, str(ages[ageidx]), galTempList[j], snCoeff, (z))
typeNames[nRows] = typeName
nRows += 1
images = np.append(images, np.array([newflux2]), axis=0)
labelsIndexes.append(labelIndex) # labels = np.append(labels, np.array([label]), axis=0)
filenames.append("{0}_{1}_{2}_{3}_snCoeff{4}_z{5}".format(snTempList[i], tType, str(ages[ageidx]), galTempList[j], snCoeff, (z)))
typeNames.append(typeName)
print(snTempList[i], nCols, galTempList[j])

return images, np.array(labelsIndexes).astype(int), np.array(filenames), np.array(typeNames), nRows
return images, np.array(labelsIndexes).astype(int), np.array(filenames), np.array(typeNames)

def collect_results(self, result):
"""Uses apply_async's callback to setup up a separate Queue for each process"""
imagesPart, labelsPart, filenamesPart, typeNamesPart, nRows = result
self.images.extend(imagesPart[0:nRows])
self.labelsIndexes.extend(labelsPart[0:nRows])
self.filenames.extend(filenamesPart[0:nRows])
self.typeNames.extend(typeNamesPart[0:nRows])
imagesPart, labelsPart, filenamesPart, typeNamesPart = result
self.images.extend(imagesPart)
self.labelsIndexes.extend(labelsPart)
self.filenames.extend(filenamesPart)
self.typeNames.extend(typeNamesPart)

def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation, snTempFileList, galTemplateLocation, galTempFileList):
if galTemplateLocation is None or galTempFileList is None:
Expand All @@ -380,7 +376,7 @@ def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation, snTempFileL
outputs = results.get()
for i, output in enumerate(outputs):
self.collect_results(output)
print('combining results...', output[-1], i, len(outputs))
print('combining results...', i, len(outputs))

self.images = np.array(self.images)
self.labelsIndexes = np.array(self.labelsIndexes)
Expand All @@ -389,14 +385,4 @@ def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation, snTempFileL

print("Completed Creating Arrays!")

# Delete temporary memory mapping files
for filename in glob.glob('images_*.dat'):
os.remove(filename)
for filename in glob.glob('labels*.dat'):
os.remove(filename)
for filename in glob.glob('filenames_*.dat'):
os.remove(filename)
for filename in glob.glob('typeNames_*.dat'):
os.remove(filename)

return self.images, self.labelsIndexes.astype(np.uint16), self.filenames, self.typeNames

0 comments on commit d273d75

Please sign in to comment.