In [4]:
import os
import argparse
import sys
import time
import pandas as pd
import re
from pathlib import Path

# Show every row when a DataFrame is printed (no truncation).
pd.set_option('display.max_rows', None)

# Directory holding the collected end-to-end execution-time CSVs.
APOLLO_DATA_COLLECTION_DIR='/usr/WS2/bolet1/apolloDataCollection'

# Load the VA and PA csv files, in that order, from the data directory.
VAdf, PAdf = (
    pd.read_csv(APOLLO_DATA_COLLECTION_DIR + csv_name)
    for csv_name in ('/static-ETE-XTimeData_VA.csv',
                     '/static-ETE-XTimeData_PA.csv')
)

print('Datasets loaded!')

# Tag each dataset with its origin so the rows stay distinguishable
# once both are stacked into a single frame.
VAdf['type'], PAdf['type'] = 'VA', 'PA'

# One combined dataframe from here on.
rawdf = pd.concat([VAdf, PAdf])

# Optional: restrict the analysis to policy 0 only.
#rawdf = rawdf.loc[rawdf['policy'] == "Static,policy=0"]

# Drop the programs we are not interested in, in a single pass.
rawdf = rawdf.loc[
    ~rawdf['progname'].isin(["rodinia_backprop", "rodinia_nn", "rodinia_nw"])
]

# Keep only the (progname, probSize, policy) configurations whose VA and PA
# runs have the same number of recorded trials.
config_keys = ['progname', 'probSize', 'policy']

grouped = rawdf.groupby(config_keys + ['type'])

# After count(), 'trialnum' and 'eteXtime' no longer hold measurements:
# they hold the number of non-null entries per group.
#print(grouped['type'].count())
counts = grouped.count().reset_index()
print(counts)

# Split the per-group counts by run type.
vas = counts[counts['type'] == 'VA']
pas = counts[counts['type'] == 'PA']

# Inner-joining on the configuration keys *and* on the two count columns
# keeps exactly the configurations present in both VA and PA with
# identical trial counts.
filtered = vas.merge(pas, how='inner', on=config_keys + ['trialnum', 'eteXtime'])

targets = filtered[config_keys]

# Restrict the raw samples to the surviving configurations and order them
# so the VA/PA trials of one configuration sit next to each other.
df = (
    targets
    .merge(rawdf, how='inner', on=config_keys)
    .sort_values(by=config_keys + ['trialnum', 'type'])
)
print(df.head())

#df = df.loc[df['trialnum'] > 3]

# With the dataset cleaned, summarise each (program, size, policy, type) group.
grouped = df.groupby(['progname', 'probSize', 'policy', 'type'])

# Per-group average and standard deviation of the end-to-end execution time.
means = grouped['eteXtime'].mean().reset_index()
stds = grouped['eteXtime'].std().reset_index()

# Summary table: the group keys plus the two statistics under explicit names.
# `means` and `stds` share the same row order, so the columns line up.
summdf = means.drop(columns='eteXtime').assign(
    mean_etextime=means['eteXtime'],
    std_etextime=stds['eteXtime'],
)

print(summdf)

# Split the summary by run type and order both halves by the same keys.
# The arithmetic below pairs PA and VA rows purely by position, so it relies
# on both halves containing the same configurations in the same order.
vadf = summdf[summdf['type'] == 'VA'].sort_values(by=['progname', 'probSize', 'policy'])
padf = summdf[summdf['type'] == 'PA'].sort_values(by=['progname', 'probSize', 'policy'])

# Difference between the PA and VA mean xtimes.
# A value greater than 0 means PA was slower than VA.
va_means = vadf['mean_etextime'].to_numpy()
timeDiffs = padf['mean_etextime'].to_numpy() - va_means

# Attach the absolute difference, the VA reference mean and the percent
# difference (relative to the VA mean), then rank by the percent difference.
padf = padf.assign(
    pa_minus_va=timeDiffs,
    va_mean_ete_xtime=va_means,
    perc_xtime_diff=(timeDiffs * 100) / va_means,
).sort_values(by=['perc_xtime_diff'])






Datasets loaded!
               progname   probSize           policy type  trialnum  eteXtime
0                  comd  largeprob  Static,policy=0   PA        10        10
1                  comd  largeprob  Static,policy=0   VA        10        10
2                  comd  largeprob  Static,policy=1   PA        10        10
3                  comd  largeprob  Static,policy=1   VA        10        10
4                  comd    medprob  Static,policy=0   PA        10        10
5                  comd    medprob  Static,policy=0   VA        10        10
6                  comd    medprob  Static,policy=1   PA        10        10
7                  comd    medprob  Static,policy=1   VA        10        10
8                  comd  smallprob  Static,policy=0   PA        10        10
9                  comd  smallprob  Static,policy=0   VA        10        10
10                 comd  smallprob  Static,policy=1   PA        10        10
11                 comd  smallprob  Static,policy=1   VA   

In [7]:
#print(padf)

# Optional: look only at the lulesh runs
#print(padf.loc[padf['progname'] == 'lulesh'])

# Outlier runs: PA differs from VA by at least 4 percent in either direction.
print(padf.loc[padf['perc_xtime_diff'].abs() >= 4])

               progname   probSize           policy type  mean_etextime  \
200             xsbench  largeprob  Static,policy=1   PA       2.571700   
198             xsbench  largeprob  Static,policy=0   PA       1.784100   
20               lulesh  smallprob  Static,policy=1   PA      71.100000   
192  rodinia_pathfinder    medprob  Static,policy=1   PA       0.010328   
60               nas_cg  smallprob  Static,policy=0   PA       4.805000   
194  rodinia_pathfinder  smallprob  Static,policy=0   PA       0.014633   
190  rodinia_pathfinder    medprob  Static,policy=0   PA       0.010304   
186  rodinia_pathfinder  largeprob  Static,policy=0   PA       0.010569   
184         rodinia_lud  smallprob  Static,policy=1   PA       0.527267   
188  rodinia_pathfinder  largeprob  Static,policy=1   PA       0.011155   
182         rodinia_lud  smallprob  Static,policy=0   PA       0.420590   

     std_etextime  pa_minus_va  va_mean_ete_xtime  perc_xtime_diff  
200      0.023519    -0.145500

In [9]:
# Zoom in on the individual samples for rodinia_lud, since it has the highest
# xtime difference across VA and PA.
lud_small_p0 = df.loc[(df['progname'] == 'rodinia_lud') &
                      (df['policy'] == 'Static,policy=0') &
                      (df['probSize'] == 'smallprob')]

# Print the PA trials first, then the VA trials.
for run_type in ('PA', 'VA'):
    print(lud_small_p0.loc[lud_small_p0['type'] == run_type])

         progname   probSize           policy  trialnum  eteXtime type
1812  rodinia_lud  smallprob  Static,policy=0         0  0.447834   PA
1813  rodinia_lud  smallprob  Static,policy=0         1  0.347258   PA
1814  rodinia_lud  smallprob  Static,policy=0         2  0.357280   PA
1815  rodinia_lud  smallprob  Static,policy=0         3  0.891652   PA
1816  rodinia_lud  smallprob  Static,policy=0         4  0.360750   PA
1817  rodinia_lud  smallprob  Static,policy=0         5  0.360228   PA
1818  rodinia_lud  smallprob  Static,policy=0         6  0.364948   PA
1819  rodinia_lud  smallprob  Static,policy=0         7  0.353635   PA
1820  rodinia_lud  smallprob  Static,policy=0         8  0.360270   PA
1821  rodinia_lud  smallprob  Static,policy=0         9  0.362050   PA
         progname   probSize           policy  trialnum  eteXtime type
1802  rodinia_lud  smallprob  Static,policy=0         0  0.339245   VA
1803  rodinia_lud  smallprob  Static,policy=0         1  0.312462   VA
1804  

In [6]:
# All quicksilver samples, then the largeprob / policy 0 subset of them.
quicksilver = df.loc[df['progname'] == 'quicksilver']
qs_large_p0 = quicksilver.loc[(quicksilver['policy'] == 'Static,policy=0') &
                              (quicksilver['probSize'] == 'largeprob')]

# PA trials first, then VA trials, for the largeprob / policy 0 configuration.
for run_type in ('PA', 'VA'):
    print(qs_large_p0.loc[qs_large_p0['type'] == run_type])

# Every quicksilver VA sample, across all problem sizes and policies.
print(quicksilver.loc[quicksilver['type'] == 'VA'])


         progname   probSize           policy  trialnum    eteXtime type
1252  quicksilver  largeprob  Static,policy=0         0  114.653583   PA
1253  quicksilver  largeprob  Static,policy=0         1  116.636351   PA
1254  quicksilver  largeprob  Static,policy=0         2  118.252588   PA
1255  quicksilver  largeprob  Static,policy=0         3  121.551012   PA
1256  quicksilver  largeprob  Static,policy=0         4  117.921782   PA
1257  quicksilver  largeprob  Static,policy=0         5  117.019064   PA
1259  quicksilver  largeprob  Static,policy=0         6  117.324534   PA
1258  quicksilver  largeprob  Static,policy=0         7  116.032967   PA
1260  quicksilver  largeprob  Static,policy=0         8  121.267572   PA
1261  quicksilver  largeprob  Static,policy=0         9  121.502470   PA
         progname   probSize           policy  trialnum    eteXtime type
1242  quicksilver  largeprob  Static,policy=0         0  115.766770   VA
1244  quicksilver  largeprob  Static,policy=0      

In [None]:
# 