In [None]:
import pandas as pd
import numpy as np
from benchmarks import *
import glob
import os, sys
import seaborn as sns
import matplotlib.pyplot as plt
import re
import time
from tqdm.notebook import tqdm

# Raise pandas' printing limits so long and wide frames are not truncated.
for _optName, _optValue in (
	('display.max_rows', 5000),
	('display.max_columns', 500),
	('display.width', 1000),
):
	pd.set_option(_optName, _optValue)

In [None]:
# The machine name is derived from the checkout path: any ROOT_DIR that does
# not contain 'lassen' is treated as 'ruby'.
if 'lassen' in ROOT_DIR:
	MACHINE = 'lassen'
else:
	MACHINE = 'ruby'
print(MACHINE, ROOT_DIR)

# Every known program is listed first, then the list is narrowed to the
# single program currently being processed.
prognames = list(progs.keys())
prognames = ['bt_nas']

# Same pattern for the problem sizes: the full set, then the active subset.
probsizes = ['smlprob', 'medprob', 'lrgprob']
probsizes = ['smlprob']

# Directory holding one sub-directory of logs per program/problem size.
logsDir = f'{ROOT_DIR}/logs'

# Global-optimization methods and the seeds each of them was run with.
goMethods = ['cma', 'pso', 'bo']
seeds = [1337, 3827, 9999, 4873]

In [None]:
def getHypersFromFilename(filename):
	"""Recover the optimizer hyperparameters encoded in a log filename.

	The directory part and the extension are dropped, then the text that
	follows the method marker is parsed up to the trailing '--DONE'.

	Parameters
	----------
	filename : str
		Path or bare name of a log file, for example
		'x-PSO-pop20-w0.7-c10.5-c20.3--DONE.csv'.

	Returns
	-------
	dict
		'-BO-' with '-ucb-': 'utilFnct', 'kappa', 'kappa_decay',
		'kappa_decay_delay'.
		'-BO-' with '-ei-' or '-poi-': 'utilFnct', 'xi'.
		'-PSO-': 'popsize', 'w', 'c1', 'c2'.
		'-CMA-': 'sigma', 'popsize' (popdecay is parsed but not returned).

	Raises
	------
	ValueError
		If the name carries no method marker, if a '-BO-' name has none of the
		known utility functions, if the hyperparameter fields cannot be
		matched, or if a field is not a valid number.
	"""

	# cut the path out if it has a path, and remove the .csv extension
	filename = os.path.splitext(os.path.basename(filename))[0]

	def _fields(marker, pattern):
		# Drop everything up to and including the marker, then capture the
		# hyperparameter fields from what is left.
		tail = filename[filename.find(marker)+len(marker):]
		matches = re.findall(pattern, tail)
		if not matches:
			raise ValueError(f'Could not parse hyperparameters after {marker!r} in {filename!r}')
		return matches[0]

	if '-BO-' in filename:
		if '-ucb-' in filename:
			k, kd, kdd = _fields('-ucb-', r'(?:k)(.*)(?:-kd)(.*)(?:-kdd)(.*)(?:--DONE)')
			return {'utilFnct':'ucb', 'kappa':float(k), 'kappa_decay':float(kd), 'kappa_decay_delay':int(kdd)}

		# 'ei' and 'poi' share the same single-field layout; 'ei' is tried first
		for utilFnct in ('ei', 'poi'):
			marker = f'-{utilFnct}-'
			if marker in filename:
				xi = _fields(marker, r'(?:xi)(.*)(?:--DONE)')
				return {'utilFnct':utilFnct, 'xi':float(xi)}

		# Without this, an unrecognised utility function silently returned None
		# and the caller failed later with an unrelated error.
		raise ValueError(f'BO filename has no known utility function (ucb/ei/poi): {filename!r}')

	elif '-PSO-' in filename:
		pop, w, c1, c2 = _fields('-PSO-', r'(?:pop)(.*)(?:-w)(.*)(?:-c1)(.*)(?:-c2)(.*)(?:--DONE)')
		return {'popsize':int(pop), 'w':float(w), 'c1':float(c1), 'c2':float(c2)}

	elif '-CMA-' in filename:
		sigma, pop, popdecay = _fields('-CMA-', r'(?:sigma)(.*)(?:-pop)(.*)(?:-popdecay)(.*)(?:--DONE)')
		sigma = float(sigma)
		pop = int(pop)
		# popdecay is still converted so a malformed value is rejected, but it
		# is left out of the result: excluded popdecay in experiments
		popdecay = float(popdecay)
		return {'sigma':sigma, 'popsize':pop}

	else:
		print('no GO method in filename...')
		raise ValueError('Failed to provide a string path with a GO Method')


In [None]:
def getData(progname, probsize, method, seed):
	"""Load every completed log of one (program, problem size, method, seed) run.

	Reads each '*DONE.csv' under
	logsDir/<progname>-<probsize>/<method>-<seed>, adds one column per
	hyperparameter decoded from the file name, concatenates the results, and
	fills the 'progname', 'probsize', 'method' and 'seed' columns with the
	arguments. The run's starter configurations are printed as a sanity check.

	Parameters
	----------
	progname, probsize, method, seed
		Identify the run; they are used both to build the directory name and
		as the values of the identifying columns.

	Returns
	-------
	pandas.DataFrame
		All rows of all completed logs. When no completed log is found, an
		empty frame with only the four identifying columns is returned.
	"""
	startTime = time.time()
	mainDF = pd.DataFrame(columns=['progname', 'probsize', 'method', 'seed'])

	runDir = logsDir+f'/{progname}-{probsize}/{method}-{seed}'
	print('working on', runDir)

	# grab the completed data
	csvs = glob.glob(runDir+'/*DONE.csv')

	# A missing directory or a run without finished logs gives no matches.
	# Return the empty frame so the caller can skip it, instead of failing
	# further down on the absent 'globalSample' column.
	if not csvs:
		print('no completed logs found in', runDir)
		return mainDF

	# read in each of the dataframes, then concatenate them all at once
	readin = []
	for csv in tqdm(csvs):
		# extract the hyperparam values from the filename
		hypers = getHypersFromFilename(csv)
		toadd = pd.read_csv(csv)

		# add the extra columns
		for hyper,val in hypers.items():
			toadd[hyper] = val

		readin.append(toadd)

	# mainDF goes first so the identifying columns lead the result
	mainDF = pd.concat([mainDF]+readin, ignore_index=True)

	mainDF['progname'] = progname
	mainDF['probsize'] = probsize
	mainDF['method'] = method
	mainDF['seed'] = seed

	print(f'{progname}-{probsize}/{method}-{seed} starter configurations')
	if 'bo' in method:
		# for BO, the rows of the very first sample
		print(mainDF[mainDF['globalSample'] == 0].drop_duplicates(ignore_index=True))
	else:
		# otherwise, the rows whose sample index is below the population size
		print(mainDF[mainDF['globalSample'] < mainDF['popsize']].drop_duplicates(ignore_index=True))

	print('completed', progname, probsize, method, seed, mainDF.shape, (time.time() - startTime), 'secs')
	return mainDF

In [None]:
# Load this machine's full exploration dataset from the databases directory.
dbFile = MACHINE + '-fullExploreDataset.csv'
xtimeDB = pd.read_csv(f'{ROOT_DIR}/databases/{dbFile}')

def getMinMaxXtimeForProg(progname, probsize):
	"""Return (min, max) of the 'xtime' values in xtimeDB for one program and problem size."""
	sameProg = xtimeDB['progname'] == progname
	sameSize = xtimeDB['probsize'] == probsize
	xtimes = xtimeDB.loc[sameProg & sameSize, 'xtime']
	return (xtimes.min(), xtimes.max())

def convertXtimesToPercent(df, minXtime, maxXtime):
	"""Rescale df['xtime'] in place onto a scale where higher is better.

	After the call, 1 means the value equals minXtime and 0 means it equals
	maxXtime. Values above maxXtime are capped so they also map to 0; this
	only happens if the max is smaller than an xtime (like when we baseline
	normalize). Values below minXtime are not capped.

	The frame is modified directly; nothing is returned.
	"""
	span = maxXtime - minXtime
	fraction = (df['xtime'] - minXtime) / span

	# anything beyond the top of the range is pulled back to exactly 1.0
	capped = fraction.mask(fraction > 1.0, 1.0)

	# flip the scale so the fastest time scores highest
	df['xtime'] = 1 - capped
	return


In [None]:
def preprocessAllLogs(minMaxNorm=False, speedupNorm=False, onlyShowSeedHeads=False):
	"""Gather the logs of every method/program and save one CSV per pair.

	For each method in goMethods and each program in prognames, the data of
	every problem size and seed is loaded with getData, optionally
	normalized, concatenated, and written to ROOT_DIR/databases.

	Parameters
	----------
	minMaxNorm : bool
		Rescale 'xtime' with convertXtimesToPercent, using the min/max xtime
		found in xtimeDB for the program and problem size.
	speedupNorm : bool
		Replace 'xtime' with baseline/xtime, where the baseline is the single
		xtimeDB row with OMP_PROC_BIND 'close', OMP_PLACES 'cores',
		OMP_SCHEDULE 'static' and the machine's thread count
		(56 on 'ruby', 80 otherwise).
	onlyShowSeedHeads : bool
		Only load the data (getData prints the starter configurations);
		nothing is concatenated or written.

	minMaxNorm and speedupNorm cannot both be set. Runs without data are
	skipped. Returns None.
	"""

	# the two normalizations are mutually exclusive
	assert not (minMaxNorm and speedupNorm)

	# the output file name records which normalization was applied
	if minMaxNorm:
		suffix = 'minMaxNorm'
	elif speedupNorm:
		suffix = 'baselineNorm'
	else:
		suffix = 'rawXtimes'

	for method in goMethods:
		for progname in prognames:
			# gather all the data
			toJoin = []
			for probsize in probsizes:
				minXtime, maxXtime = getMinMaxXtimeForProg(progname, probsize)

				# The baseline depends only on program and problem size, so it
				# is looked up at most once here, and only if some seed has data.
				baselineVal = None

				for seed in seeds:
					df = getData(progname, probsize, method, seed)

					# if there's no data
					if df.shape[0] == 0:
						continue

					if minMaxNorm:
						convertXtimesToPercent(df, minXtime, maxXtime)

					elif speedupNorm:
						if baselineVal is None:
							# equality, not substring membership, selects the machine
							numthreads = 56 if MACHINE == 'ruby' else 80
							baseline = xtimeDB[(xtimeDB['progname'] == progname)
															 & (xtimeDB['probsize'] == probsize)
															 & (xtimeDB['OMP_NUM_THREADS'] == numthreads)
															 & (xtimeDB['OMP_PROC_BIND'] == 'close')
															 & (xtimeDB['OMP_PLACES'] == 'cores')
															 & (xtimeDB['OMP_SCHEDULE'] == 'static')]
							# exactly one baseline row must exist
							assert baseline.shape[0] == 1
							baselineVal = baseline['xtime'].iat[0]
						df.loc[:,'xtime'] = baselineVal/df['xtime']

					toJoin.append(df)

			# if there's no data to join
			if len(toJoin) == 0:
				continue

			if onlyShowSeedHeads:
				continue

			fullDF = pd.concat(toJoin, ignore_index=True)

			# let's save the dataframe for future re-use
			filename = ROOT_DIR+'/databases/'+f'{MACHINE}-{progname}-{method}-GO_Data-{suffix}.csv'
			fullDF.to_csv(filename, index=False)
			# report only after the write has succeeded
			print('\n\n wrote:', filename, '\n\n')
	return

In [None]:
# Other modes, to be run one at a time when the CSVs should be regenerated:
#   preprocessAllLogs(speedupNorm=True)   -> writes the '...-baselineNorm.csv' files
#   preprocessAllLogs(minMaxNorm=True)    -> writes the '...-minMaxNorm.csv' files
#   preprocessAllLogs()                   -> writes the '...-rawXtimes.csv' files
# Active mode: load the logs and print the starter configurations only;
# no CSV is written.
preprocessAllLogs(minMaxNorm=False, speedupNorm=False, onlyShowSeedHeads=True)