Skip to content

Commit

Permalink
Redshifting during training to reduce the array size that must be built beforehand.…
Browse files Browse the repository at this point in the history
… Redshifting using log wavelength scale, and apodizing after this.

Added commands `astrodash` and `dash` to setup.py so that running this will open the GUI.
Saving old memory mapping method (in case arrays get too large again) as create_arrays_with_memory_mapping.py.
No longer doing memory mapping in create_arrays.py because some of the augmentation steps that were causing large arrays have been moved to occur during training rather than before.
  • Loading branch information
daniel-muthukrishna committed Jul 6, 2018
1 parent 7129383 commit d273d75
Show file tree
Hide file tree
Showing 8 changed files with 523 additions and 40 deletions.
2 changes: 1 addition & 1 deletion dash/classify_OzDES_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def main(runDirectory, atelTextFile, saveMatchesFilename):
print("{0:17} | {1:5} | {2:8} | {3:10} | {4:6} | {5:10} | {6:10}".format('_'.join([filenames[i].split('/')[-1].split('_')[0], filenames[i].split('/')[-1].split('_')[3]]) , redshifts[i], bestTypes[i][0], bestTypes[i][1], bestTypes[i][2], matchesFlag[i].replace(' matches',''), wikiClassifications[i]))

# Plot one of the matches
classification.plot_with_gui(indexToPlot=7)
classification.plot_with_gui(indexToPlot=0)


if __name__ == '__main__':
Expand Down
11 changes: 8 additions & 3 deletions dash/create_and_save_all_data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,14 @@
classifyHost = False
minZ = 0.
maxZ = 0.8
numOfRedshifts = 50
trainFraction = 1.0
redshiftDuringTraining = True
trainFraction = 1
numTrainBatches = 2000000
# Do not change this unless we want to redshift before training.
numOfRedshifts = 1

if numOfRedshifts != 1:
redshiftDuringTraining = False

dataDirName = os.path.join(scriptDirectory, 'data_files_{0}/'.format(modelName))
dataFilenames = []
Expand Down Expand Up @@ -58,7 +63,7 @@
print("time spent: {0:.2f}".format(t3 - t2))

# TRAIN TENSORFLOW MODEL
modelFilenames = train_model(dataDirName, overwrite=True, numTrainBatches=numTrainBatches)
modelFilenames = train_model(dataDirName, overwrite=True, numTrainBatches=numTrainBatches, minZ=minZ, maxZ=maxZ, redshifting=redshiftDuringTraining)
dataFilenames.extend(modelFilenames)
t4 = time.time()
print("time spent: {0:.2f}".format(t4 - t3))
Expand Down
50 changes: 18 additions & 32 deletions dash/create_arrays.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import os
import glob
import numpy as np
from random import shuffle
import multiprocessing as mp
Expand Down Expand Up @@ -294,21 +292,19 @@ def __init__(self, w0, w1, nw, nTypes, minAge, maxAge, ageBinSize, typeList, min
self.createLabels = CreateLabels(self.nTypes, self.minAge, self.maxAge, self.ageBinSize, self.typeList, hostTypes, nHostTypes)
self.hostTypes = hostTypes

# TODO: Maybe do memory mapping for these arrays
self.images = []
self.labelsIndexes = []
self.filenames = []
self.typeNames = []

def combined_sn_gal_templates_to_arrays(self, args):
snTemplateLocation, snTempList, galTemplateLocation, galTempList, snFractions = args

randnum = np.random.randint(10000)
arraySize = len(galTempList) * len(snTempList) * 50 * len(snFractions) * self.numOfRedshifts
images = np.memmap('images_{}_{}.dat'.format(snTempList[0], randnum), dtype=np.float16, mode='w+', shape=(arraySize, int(self.nw)))
labelsIndexes = np.memmap('labels_{}_{}.dat'.format(snTempList[0], randnum), dtype=np.uint16, mode='w+', shape=arraySize)
filenames = np.memmap('filenames_{}_{}.dat'.format(snTempList[0], randnum), dtype=object, mode='w+', shape=arraySize)
typeNames = np.memmap('typeNames_{}_{}.dat'.format(snTempList[0], randnum), dtype=object, mode='w+', shape=arraySize)
nRows = 0
images = np.empty((0, int(self.nw)), np.float16) # Number of pixels
labelsIndexes = []
filenames = []
typeNames = []
agesList = []

for j in range(len(galTempList)):
galFilename = galTemplateLocation + galTempList[j] if galTemplateLocation is not None else None
Expand All @@ -326,6 +322,7 @@ def combined_sn_gal_templates_to_arrays(self, args):
redshifts = np.random.uniform(low=self.minZ, high=self.maxZ, size=self.numOfRedshifts)
for z in redshifts:
tempWave, tempFlux, nCols, ages, tType, tMinIndex, tMaxIndex = readSpectra.sn_plus_gal_template(ageidx, snCoeff, galCoeff, z)
agesList.append(ages[ageidx])
if tMinIndex == tMaxIndex or not tempFlux.any():
print("NO DATA for {} {} ageIdx:{} z>={}".format(galTempList[j], snTempList[i], ageidx, z))
break
Expand All @@ -340,22 +337,21 @@ def combined_sn_gal_templates_to_arrays(self, args):
nonzeroflux = tempFlux[tMinIndex:tMaxIndex + 1]
newflux = (nonzeroflux - min(nonzeroflux)) / (max(nonzeroflux) - min(nonzeroflux))
newflux2 = np.concatenate((tempFlux[0:tMinIndex], newflux, tempFlux[tMaxIndex + 1:]))
images[nRows] = np.array([newflux2])
labelsIndexes[nRows] = labelIndex
filenames[nRows] = "{0}_{1}_{2}_{3}_snCoeff{4}_z{5}".format(snTempList[i], tType, str(ages[ageidx]), galTempList[j], snCoeff, (z))
typeNames[nRows] = typeName
nRows += 1
images = np.append(images, np.array([newflux2]), axis=0)
labelsIndexes.append(labelIndex) # labels = np.append(labels, np.array([label]), axis=0)
filenames.append("{0}_{1}_{2}_{3}_snCoeff{4}_z{5}".format(snTempList[i], tType, str(ages[ageidx]), galTempList[j], snCoeff, (z)))
typeNames.append(typeName)
print(snTempList[i], nCols, galTempList[j])

return images, np.array(labelsIndexes).astype(int), np.array(filenames), np.array(typeNames), nRows
return images, np.array(labelsIndexes).astype(int), np.array(filenames), np.array(typeNames)

def collect_results(self, result):
"""Uses apply_async's callback to setup up a separate Queue for each process"""
imagesPart, labelsPart, filenamesPart, typeNamesPart, nRows = result
self.images.extend(imagesPart[0:nRows])
self.labelsIndexes.extend(labelsPart[0:nRows])
self.filenames.extend(filenamesPart[0:nRows])
self.typeNames.extend(typeNamesPart[0:nRows])
imagesPart, labelsPart, filenamesPart, typeNamesPart = result
self.images.extend(imagesPart)
self.labelsIndexes.extend(labelsPart)
self.filenames.extend(filenamesPart)
self.typeNames.extend(typeNamesPart)

def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation, snTempFileList, galTemplateLocation, galTempFileList):
if galTemplateLocation is None or galTempFileList is None:
Expand All @@ -380,7 +376,7 @@ def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation, snTempFileL
outputs = results.get()
for i, output in enumerate(outputs):
self.collect_results(output)
print('combining results...', output[-1], i, len(outputs))
print('combining results...', i, len(outputs))

self.images = np.array(self.images)
self.labelsIndexes = np.array(self.labelsIndexes)
Expand All @@ -389,14 +385,4 @@ def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation, snTempFileL

print("Completed Creating Arrays!")

# Delete temporary memory mapping files
for filename in glob.glob('images_*.dat'):
os.remove(filename)
for filename in glob.glob('labels*.dat'):
os.remove(filename)
for filename in glob.glob('filenames_*.dat'):
os.remove(filename)
for filename in glob.glob('typeNames_*.dat'):
os.remove(filename)

return self.images, self.labelsIndexes.astype(np.uint16), self.filenames, self.typeNames

0 comments on commit d273d75

Please sign in to comment.