In [1]:
import os
import argparse
import sys
import time
import pandas as pd
import re
from pathlib import Path

In [2]:
# Directory that holds the collected timing CSVs read below.
APOLLO_DATA_COLLECTION_DIR = '/usr/WS2/bolet1/apolloDataCollection'
#APOLLO_DATA_COLLECTION_DIR='/g/g15/bolet1/workspace/apolloDataCollection/finalData/quartz/static_runs_no_traces'

# Read the two timing tables (one CSV for VA runs, one for PA runs).
# NOTE(review): "ETE-XTime" presumably means end-to-end execution time -- confirm.
VAdf = pd.read_csv(f'{APOLLO_DATA_COLLECTION_DIR}/online-ETE-XTimeData_VA.csv')
PAdf = pd.read_csv(f'{APOLLO_DATA_COLLECTION_DIR}/online-ETE-XTimeData_PA.csv')

In [5]:
# Sanity check: (rows, columns) of each table, then the column names of the VA table.
for frame in (VAdf, PAdf):
    print(frame.shape)
print(VAdf.columns)

(1811, 7)
(1812, 7)
Index(['progname', 'probSize', 'policy', 'minTrainData', 'trialnum',
       'eteXtime', 'type'],
      dtype='object')


In [4]:
print('Datasets loaded!')

# Label every row with the table it came from. This writes the 'type'
# column onto VAdf and PAdf themselves, not onto copies.
for run_type, frame in (('VA', VAdf), ('PA', PAdf)):
    frame['type'] = run_type

# Stack both tables (VA rows first) so later cells deal with one frame.
# Each table keeps its own row labels, so index values repeat in rawdf.
rawdf = pd.concat((VAdf, PAdf))

Datasets loaded!


In [7]:
# Keep only the configurations for which VA and PA have the same number of runs.
grouped = rawdf.groupby(['progname', 'probSize', 'policy', 'minTrainData', 'type'])

# count() leaves the non-key columns ('trialnum', 'eteXtime') holding the
# number of non-null entries per (configuration, type) group.
counts = grouped.count().reset_index()
print(counts)

# Per-group counts, split by run type.
vas = counts[counts['type'] == 'VA']
pas = counts[counts['type'] == 'PA']

# Joining on the configuration keys AND on the two count columns keeps a
# configuration only when it appears for both types with identical counts.
filtered = vas.merge(
    pas,
    how='inner',
    on=['progname', 'probSize', 'policy', 'minTrainData', 'trialnum', 'eteXtime'],
)

# The surviving configurations, keys only.
targets = filtered[['progname', 'probSize', 'policy', 'minTrainData']]

# Restrict the raw rows to those configurations and put them in a stable,
# readable order (VA/PA rows of the same trial end up next to each other).
df = targets.merge(
    rawdf,
    how='inner',
    on=['progname', 'probSize', 'minTrainData', 'policy'],
).sort_values(by=['progname', 'probSize', 'policy', 'minTrainData', 'trialnum', 'type'])
print(df.head())

    progname   probSize                                       policy  \
0       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
1       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
2       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
3       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
4       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
5       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
6       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
7       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
8       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
9       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
10      comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
11      comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
12      comd  largeprob  DecisionTree,max_depth=4,explore=RoundR

In [8]:
# Summarize the filtered runs: one row per (configuration, type).
grouped = df.groupby(['progname', 'probSize', 'policy', 'minTrainData', 'type'])

# Per-group mean and standard deviation of eteXtime, with the group keys
# turned back into ordinary columns.
means = grouped['eteXtime'].mean().reset_index()
stds = grouped['eteXtime'].std().reset_index()

# Start from the key columns of `means` and attach both statistics under
# explicit names; `means` and `stds` themselves are left untouched.
summdf = means.drop(columns='eteXtime').assign(
    mean_etextime=means['eteXtime'],
    std_etextime=stds['eteXtime'],
)

print(summdf)


    progname   probSize                                       policy  \
0       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
1       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
2       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
3       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
4       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
5       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
6       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
7       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
8       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
9       comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
10      comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
11      comd  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
12      comd  largeprob  DecisionTree,max_depth=4,explore=RoundR

In [10]:

# Compare PA against VA mean execution times, configuration by configuration.
# Split the summary table by run type.
vadf = summdf.loc[summdf['type'] == 'VA']
padf = summdf.loc[summdf['type'] == 'PA']

# Put both halves in the same configuration order.
vadf = vadf.sort_values(by=['progname', 'probSize', 'policy', 'minTrainData'])
padf = padf.sort_values(by=['progname', 'probSize', 'policy', 'minTrainData'])

# The arithmetic below pairs VA and PA rows purely by position (raw numpy
# arrays, no index alignment). Verify that row i of both frames describes the
# same configuration; otherwise the differences would be silently wrong.
key_cols = ['progname', 'probSize', 'policy', 'minTrainData']
if not vadf[key_cols].reset_index(drop=True).equals(padf[key_cols].reset_index(drop=True)):
    raise ValueError(
        'VA and PA summaries do not cover the same configurations in the same order'
    )

va_means = vadf['mean_etextime'].to_numpy()

# Now let's take the diffs between PA and VA mean xtimes
timeDiffs = padf['mean_etextime'].to_numpy() - va_means

# If any values are greater than 0, then PA was slower
padf['pa_minus_va'] = timeDiffs
padf['va_mean_ete_xtime'] = va_means

# Difference expressed as a percentage of the VA mean.
padf['perc_xtime_diff'] = (timeDiffs * 100) / va_means

# Most negative (PA fastest relative to VA) first.
padf = padf.sort_values(by=['perc_xtime_diff'])

print(padf)

        progname   probSize                                       policy  \
762  rodinia_lud  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
764  rodinia_lud  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
782  rodinia_lud    medprob  DecisionTree,max_depth=4,explore=RoundRobin   
766  rodinia_lud  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
768  rodinia_lud  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
772  rodinia_lud  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
774  rodinia_lud  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
784  rodinia_lud    medprob  DecisionTree,max_depth=4,explore=RoundRobin   
786  rodinia_lud    medprob  DecisionTree,max_depth=4,explore=RoundRobin   
788  rodinia_lud    medprob  DecisionTree,max_depth=4,explore=RoundRobin   
776  rodinia_lud  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
770  rodinia_lud  largeprob  DecisionTree,max_depth=4,explore=RoundRobin   
778  rodinia

In [17]:


# Optional: inspect the rows of a single program.
#print(padf.loc[padf['progname'] == 'rodinia_lud'])

# Outliers: configurations where the PA mean differs from the VA mean by
# at least 50% in either direction.
outlier_mask = padf['perc_xtime_diff'].abs() >= 50
print(padf.loc[outlier_mask, ['progname', 'probSize', 'minTrainData', 'perc_xtime_diff']])

        progname   probSize  minTrainData  perc_xtime_diff
762  rodinia_lud  largeprob             3       -97.466639
764  rodinia_lud  largeprob             6       -97.322345
782  rodinia_lud    medprob             3       -97.217390
766  rodinia_lud  largeprob             9       -97.131616
768  rodinia_lud  largeprob            12       -96.986970
772  rodinia_lud  largeprob            18       -96.615114
774  rodinia_lud  largeprob            21       -96.368442
784  rodinia_lud    medprob             6       -96.308878
786  rodinia_lud    medprob             9       -96.190741
788  rodinia_lud    medprob            12       -96.183286
776  rodinia_lud  largeprob            24       -96.160526
770  rodinia_lud  largeprob            15       -95.726939
778  rodinia_lud  largeprob            27       -95.716634
790  rodinia_lud    medprob            15       -95.621221
802  rodinia_lud  smallprob             3       -95.544329
780  rodinia_lud  largeprob            30       -95.5264