In [9]:
import pandas as pd
import numpy as np
from benchmarks import *
import glob
import os, sys
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# For each program/problem-size directory under explorData, load its CSV data
# and keep the runs whose samples have all been collected.

# NOTE: this changes the kernel's working directory; the relative paths below
# (and relative paths in any later cell) resolve against explorData.
os.chdir(ROOT_DIR+'/explorData')

# directory names are expected to look like '<progname>-<probsize>'
dirs = sorted(os.listdir())

# directory name -> {'progname', 'probsize', 'df'}, for complete runs only
doneRuns = {}

# the loop variable is deliberately not called `dir`, which would shadow the
# builtin dir() for the rest of the kernel session
for runDir in dirs:
	# skip stray entries (plain files, names without a '-') instead of
	# crashing on them with an IndexError
	nameParts = runDir.split('-')
	if len(nameParts) < 2 or not os.path.isdir(runDir):
		print('\t', 'skipping', runDir, '(not a <progname>-<probsize> directory)')
		continue
	progname, probsize = nameParts[0], nameParts[1]

	# the full list of points that should be sampled for this run
	allJobs = os.path.join('.', runDir, 'allUniquePointsToSample.csv')
	if not os.path.isfile(allJobs):
		print('\t', progname, probsize, end='\t')
		print('skipping: no allUniquePointsToSample.csv found')
		continue

	# one complete.csv per sub-directory; sorted so the row order of the
	# joined frame does not depend on filesystem listing order
	doneFiles = sorted(glob.glob('./'+runDir+'/*/complete.csv'))

	allData = pd.read_csv(allJobs)
	expectedCols = ['xtime']+list(allData.columns)

	# join all the done files into one frame with a single concat; growing a
	# frame inside a loop is quadratic, and seeding it with an empty frame
	# triggers a pandas FutureWarning about result dtypes
	if doneFiles:
		doneData = pd.concat([pd.read_csv(doneFile) for doneFile in doneFiles],
			ignore_index=True)
		# keep the column layout the rest of the notebook sees:
		# xtime first, then the sample-point columns, then anything extra
		extraCols = [col for col in doneData.columns if col not in expectedCols]
		doneData = doneData.reindex(columns=expectedCols+extraCols)
	else:
		doneData = pd.DataFrame(columns=expectedCols)

	# drop rows whose xtime is -1.0 (presumably a failed/invalid run marker --
	# TODO confirm). Because the row count is compared AFTER this filter, a
	# run containing any -1.0 rows is reported as incomplete.
	doneData = doneData[doneData['xtime'] != -1.0]

	# if we have all the data, let's analyze it
	if doneData.shape[0] == allData.shape[0]:
		print(progname, probsize, end='\t')
		print(doneData.shape, allData.shape, 'all samples collected!')
		doneRuns[runDir] = {'progname':progname, 'probsize': probsize, 'df':doneData}
	else:
		print('\t', progname, probsize, end='\t')
		print('incomplete data! Collected', doneData.shape[0], '/', allData.shape[0], 'samples')





	 bfs_rodinia lrgprob	incomplete data! Collected 0 / 3564 samples
bfs_rodinia medprob	(3564, 7) (3564, 6) all samples collected!
bfs_rodinia smlprob	(3564, 7) (3564, 6) all samples collected!
	 bt_nas lrgprob	incomplete data! Collected 1630 / 3564 samples
	 bt_nas medprob	incomplete data! Collected 3561 / 3564 samples
	 bt_nas smlprob	incomplete data! Collected 3562 / 3564 samples
	 cfd_rodinia lrgprob	incomplete data! Collected 0 / 3564 samples
cfd_rodinia medprob	(3564, 7) (3564, 6) all samples collected!
cfd_rodinia smlprob	(3564, 7) (3564, 6) all samples collected!
	 cg_nas lrgprob	incomplete data! Collected 3560 / 3564 samples
cg_nas medprob	(3564, 7) (3564, 6) all samples collected!
cg_nas smlprob	(3564, 7) (3564, 6) all samples collected!
	 ft_nas lrgprob	incomplete data! Collected 2432 / 3564 samples
ft_nas medprob	(3564, 7) (3564, 6) all samples collected!
ft_nas smlprob	(3564, 7) (3564, 6) all samples collected!
	 hpcg lrgprob	incomplete data! Collected 0 / 3564 samples
	 hpc

In [None]:
# Visualise every fully-sampled run: average the repeated samples of each
# configuration, print the fastest configurations, then plot xtime against
# each hyperparameter.

for runInfo in doneRuns.values():
	progname, probsize = runInfo['progname'], runInfo['probsize']

	# work on a deep copy so the frame stored in doneRuns stays untouched
	samples = runInfo['df'].copy(deep=True)

	# keep only rows whose xtime is not -1.0, then discard the two
	# identifier columns (progname / probsize)
	validRows = samples['xtime'] != -1.0
	df = samples[validRows].drop(columns=['progname', 'probsize'])

	# every remaining column except the measured xtime is a hyperparameter
	hparams = [col for col in df.columns if col != 'xtime']

	# mean xtime per unique hyperparameter combination
	perConfig = df.groupby(hparams)
	avrgd = perConfig.mean().reset_index()

	print(progname, probsize)

	# the ten configurations with the lowest mean xtime
	fastest = avrgd.sort_values(by=['xtime']).head(10)
	print(fastest)

	# the 160-thread / static-schedule configuration, printed for comparison
	# (presumably the baseline setting -- TODO confirm)
	isReference = (avrgd['OMP_NUM_THREADS'] == 160) & (avrgd['OMP_SCHEDULE'] == 'static')
	print(avrgd[isReference])

	# one panel per hyperparameter (y axis), all sharing xtime on the x axis
	pp = sns.pairplot(avrgd, y_vars=hparams, x_vars=['xtime'])
	pp.fig.set_size_inches(15,15)
	pp.fig.suptitle(f'{progname} ({probsize})', x=0.7, y=0.999)

	# put xtime on a log scale in every panel labelled with a hyperparameter
	for ax in pp.axes.flat:
		if ax.get_ylabel() not in hparams:
			continue
		ax.set(xscale='log')

	plt.show()


	
	
