# Polybench/AEP Analysis

The purpose of this notebook is to facilitate the evaluation of the Polybench 3.2 results and their corresponding AEP data.


In [1]:
#!/usr/bin/python3

import os
import matplotlib.pyplot as plt
import matplotlib.dates as matdates
from matplotlib2tikz import save as tikz_save
import numpy as np
import pandas as pd
from collections import OrderedDict


In [2]:
class PolybenchData:
    """Remember a Polybench log path together with the names derived from it."""

    # Class-level list; nothing inside this class adds to it.
    tests = []

    def __init__(self, log):
        """Store *log*, its file name, and the file name without its extension."""
        file_name = os.path.basename(log)
        base_name, _extension = os.path.splitext(file_name)
        self.log = log  # the path exactly as it was given
        self.log_name = file_name
        self.log_base = base_name
    

In [19]:
class AEPWatchData:
    """Load one AEP watch log and plot per-module measurements from it.

    The log is read as a ';'-separated CSV whose first six lines are skipped.
    Each remaining row is given two per-row columns (``per_row``) followed by
    one group of ``per_entry`` columns for every module, named
    ``'<entry>.<module index>'``, plus a trailing empty column that is dropped.
    """

    # Number of PMEM modules described by every row (our systems have 12).
    module_count = 12

    per_row = ('epoch', 'timestamp')
    per_entry = ('bytes read', 'bytes written', 'read hit ratio',
                 'write hit ratio', 'wdb merge percent',
                 'sxp read ops', 'sxp write ops',
                 'read 64B ops received', 'write 64B ops received',
                 'ddrt read ops', 'ddrt write ops', 'block read ops', 'block write ops')

    # Named (offset, on/off sequence) dash patterns, cycled through by module
    # index when plotting so that neighbouring modules get different lines.
    linestyles = OrderedDict(
        [('solid',               (0, ())),
         ('loosely dotted',      (0, (1, 10))),
         ('dotted',              (0, (1, 5))),
         ('densely dotted',      (0, (1, 1))),

         ('loosely dashed',      (0, (5, 10))),
         ('dashed',              (0, (5, 5))),
         ('densely dashed',      (0, (5, 1))),

         ('loosely dashdotted',  (0, (3, 10, 1, 10))),
         ('dashdotted',          (0, (3, 5, 1, 5))),
         ('densely dashdotted',  (0, (3, 1, 1, 1))),

         ('loosely dashdotdotted', (0, (3, 10, 1, 10, 1, 10))),
         ('dashdotdotted',         (0, (3, 5, 1, 5, 1, 5))),
         ('densely dashdotdotted', (0, (3, 1, 1, 1, 1, 1)))])

    # Class-level placeholders kept so the attribute names still exist on the
    # class.  They are never mutated: __init__ gives every instance its own
    # lists (appending to these shared lists made them grow per instance).
    dimms = []
    read_hit_ratio = []
    write_hit_ratio = []
    wdb_merge_percentage = []
    sxp_read = []
    sxp_write = []
    ddrt_read = []
    ddrt_write = []

    figdict = {
        # measurement       :  y-label,       column name,          description,                      Derived
        'ddrt read'         : ('operations', 'ddrt read ops',       'CPU read operations (64B)',      False),
        'ddrt write'        : ('operations', 'ddrt write ops',      'CPU write operations (64B)',     False),
        'sxp read ops'      : ('operations', 'sxp read ops',        'Controller read ops (64B)',      False),
        'sxp write ops'     : ('operations', 'sxp write ops',       'Controller write ops (64B)',     False),
        'read hit ratio'    : ('ratio',      'read hit ratio',      'Prefetcher read efficiency',     True),
        'write hit ratio'   : ('ratio',      'write hit ratio',     'Prefetcher write efficiency',    True),
        'write merge ratio' : ('ratio',      'wdb merge percent',   'Write merge efficiency',         True)
        # omitted:
        # cpu bytes read - this is ddrt * 64
        # cpu bytes written - this is ddrt * 64
        # pmem bytes read - this is sxp read ops * 64
        # pmem bytes written - this is sxp write ops * 64
        # read blocks - this is always zero
        # write blocks - this is always zero
    }

    def __init__(self, log, starting_module=6, ending_module=11):
        """Read *log* and remember which modules to plot.

        Args:
            log: path of the AEP watch CSV file.
            starting_module: index of the first module drawn by the plots.
            ending_module: index of the last module drawn (inclusive).
        """
        self.log = log
        self.log_name = os.path.basename(self.log)
        self.log_base = os.path.splitext(self.log_name)[0]
        self.starting_module = starting_module
        self.ending_module = ending_module

        modules = range(self.module_count)
        # Per-instance lists of labels / column names, one entry per module.
        self.dimms = ['DIMM{}'.format(index) for index in modules]
        self.ddrt_read = ['ddrt read ops.{}'.format(index) for index in modules]
        self.ddrt_write = ['ddrt write ops.{}'.format(index) for index in modules]
        self.read_hit_ratio = ['read hit ratio.{}'.format(index) for index in modules]
        self.write_hit_ratio = ['write hit ratio.{}'.format(index) for index in modules]
        self.wdb_merge_percentage = ['wdb merge percent.{}'.format(index) for index in modules]
        self.sxp_read = ['sxp read ops.{}'.format(index) for index in modules]
        self.sxp_write = ['sxp write ops.{}'.format(index) for index in modules]

        # Column names in file order: per-row fields, then every per-entry
        # field for module 0, module 1, ...; the last column is always empty.
        self.columns = list(self.per_row)
        for index in modules:
            self.columns.extend('{}.{}'.format(entry, index) for entry in self.per_entry)
        self.columns.append('unused')
        self._load_log()

    def _load_log(self):
        """Parse the log into ``self.aep_data`` and return ``self``."""
        self.aep_data = pd.read_csv(self.log, skiprows=6, sep=';', header=None, names=self.columns)
        self.aep_data = self.aep_data.drop('unused', axis=1)  # drop the empty last column
        return self

    def get_aepdata(self):
        """Return the DataFrame loaded from the log."""
        return self.aep_data

    def get_log_base_name(self):
        """Return the log file name without directory or extension."""
        return self.log_base

    def plot_specific_entry(self, entry_name):
        """Plot one ``figdict`` measurement for the selected modules.

        The x axis is the 'epoch' column relative to its first value.  The
        figure is saved as ``'<log base>-<entry_name>.png'`` and then shown.
        Nothing happens when the log contains no rows.

        Args:
            entry_name: a key of ``figdict``.
        """
        times = list(self.aep_data['epoch'])
        if not times:
            return  # nothing to plot
        timestamps = [t - times[0] for t in times]
        _ylabel, column, description, _derived = self.figdict[entry_name]
        styles = list(self.linestyles.values())

        plt.style.use('ggplot')
        plt.title(self.log_base)
        plt.ylabel(description, fontsize="x-large")
        plt.xlabel('time (seconds)', fontsize="x-large")

        for index in range(self.starting_module, self.ending_module + 1):
            plt.plot(timestamps, list(self.aep_data['{}.{}'.format(column, index)]),
                     label=self.dimms[index], linestyle=styles[index % len(styles)], marker='o')

        plt.legend()
        plt.savefig('{}-{}.png'.format(self.log_base, entry_name))
        plt.show()

    def plot_data(self):
        """Plot every measurement listed in ``figdict``."""
        for entry in self.figdict:
            self.plot_specific_entry(entry)


There are actually more log files than I need to process.  I'm going to process the last set of runs, which are all captured inside the `runall-*.log` files.  That allows me to capture the data set sizes I used, as well as then work on decomposing the results.  There's a LOT of data here and I need to distill it.

Unfortunately, I didn't explicitly mark the data set size.  So I have to rely upon the fact I know the ordering.  The order was:

extra large
large
standard
small

**Note**: actually, I store the data in the make log, so I _can_ compute it.

I create a separate directory for **each** of these individual sizes.  So there are 5 test runs x 4 directories per test run.

In [301]:
class PolybenchLogData:
    """Collect the Polybench results referenced by the ``runall*.log`` files.

    Every ``mkdir`` line in a runall log names a results directory.  The
    ``.log`` files in that directory are parsed as run logs (one per entry of
    ``run_log_types``) or make logs (``make_log_types``), and everything is
    flattened into a single pandas DataFrame (see ``get_dataframe``).
    """

    run_log_types = ('dram', 'pmem1', 'pmem7')
    make_log_types = ('make',)

    class MakeLogData:
        """Build options and data set size found in one make log."""

        pb_options = ('POLYBENCH_TIME',
                      'POLYBENCH_NO_FLUSH_CACHE',
                      'POLYBENCH_LINUX_FIFO_SCHEDULER',
                      'POLYBENCH_CACHE_SIZE_KB',  # not really parsing this
                      'POLYBENCH_STACK_ARRAYS',
                      'POLYBENCH_DUMP_ARRAYS',
                      'POLYBENCH_CYCLE_ACCURATE_TIMER',
                      'POLYBENCH_PAPI',
                      'POLYBENCH_USE_C99_PROTO',
                      'POLYBENCH_USE_SCALAR_LB',
                      )

        # 'LARGE_DATASET' is a substring of 'EXTRALARGE_DATASET'; because the
        # longer name comes later in this tuple it wins in parse_options().
        pb_datasets = ('MINI_DATASET',
                       'SMALL_DATASET',
                       'STANDARD_DATASET',
                       'LARGE_DATASET',
                       'EXTRALARGE_DATASET',
                       )

        def __init__(self, makelog):
            """Load and scan *makelog*; its path must contain 'make'."""
            self.makelog = makelog
            assert 'make' in makelog, "Unknown make log type {}".format(makelog)
            # NOTE: 'label' is a plain attribute.  The former label() method
            # was shadowed by it and could never be called; use get_label().
            self.label = 'make'
            self.file_data = None
            self.options = {x: False for x in self.pb_options}
            self.dataset = self.pb_datasets[2]  # STANDARD_DATASET unless the log names another
            self.load_file()
            self.parse_options()
            assert not self.options['POLYBENCH_CACHE_SIZE_KB'], "Not handling this option"

        def get_label(self):
            """Return the log data label."""
            return self.label

        def load_file(self):
            """Load the makefile log."""
            with open(self.makelog, 'r') as fd:
                self.file_data = fd.readlines()
            return self

        def parse_options(self):
            """Scan the make log to find the build options and data set."""
            assert self.file_data is not None, "File data not loaded"
            for line in self.file_data:
                for option in self.pb_options:
                    if option in line:
                        self.options[option] = True
                for ds in self.pb_datasets:
                    if ds in line:
                        self.dataset = ds  # the last match wins
            return self

        def get_options(self):
            """Get a list of the options that are set in this make log."""
            return [x for x in self.options if self.options[x]]

        def get_all_options(self):
            """Get the option -> True/False dictionary."""
            return self.options

        def get_dataset(self):
            """Get the data set used."""
            return self.dataset

    class RunLogData:
        """Per-test allocation size and tick count parsed from one run log."""

        results_labels = ('Test', 'Allocation', 'Ticks')

        def __init__(self, runlog):
            """Load and parse *runlog*; its path must name a known run type."""
            self.runlog = runlog
            if 'dram' in runlog:
                self.label = 'dram'
            elif 'pmem1' in runlog:
                self.label = 'pmem1'
            elif 'pmem7' in runlog:
                self.label = 'pmem7'
            else:
                assert False, "Unknown run log type {}".format(runlog)
            self.log_data = []
            self.results = []
            self.load_file()
            self.parse_log()

        def get_label(self):
            """Get the run type label ('label' itself is a plain attribute)."""
            return self.label

        def load_file(self):
            """Load the run log data."""
            # The log is only read, so open it read-only.
            with open(self.runlog, 'r') as fd:
                self.log_data = fd.readlines()
            return self

        def parse_log(self):
            """Turn the raw lines into ``(test, total_size, ticks)`` tuples.

            Each entry is a test line, zero or more lines containing
            'allocate' (whose last space-separated field is a size), and one
            line holding the tick count.  Blank lines between entries are
            skipped.

            Raises:
                ValueError: if an entry has no tick line or it is not an
                    integer.
            """
            lines = self.log_data
            count = len(lines)
            index = 0
            while index < count:
                test = lines[index]
                index += 1
                if not test.strip():
                    continue  # tolerate blank lines, e.g. at the end of the file
                alloc = []
                while index < count and 'allocate' in lines[index]:
                    alloc.append(lines[index])
                    index += 1
                if index >= count:
                    raise ValueError(
                        "Truncated run log {}: no tick count for {}".format(self.runlog, test.strip()))
                ticks = int(lines[index])
                index += 1
                # Keep the last path component and drop its final five characters.
                test = test.split('/')[-1].strip()[:-5]
                total_size = sum(int(al.split(' ')[-1].strip()) for al in alloc)
                self.results.append((test, total_size, ticks))
            return self

        def get_results(self):
            """Return the list of ``(test, total_size, ticks)`` tuples."""
            return self.results

    class AEPData:
        """The AEP watch data of every ``.csv`` file in one directory."""

        def __init__(self, dir):
            self.dir = dir
            self.aep_logs = [x for x in os.listdir(dir) if x.endswith('.csv')]
            self.aep_data = {}
            self.load_logs()

        def load_logs(self):
            """Load each CSV into ``aep_data`` keyed by its extension-less name."""
            for al in self.aep_logs:
                resname = os.path.splitext(al)[0]
                awd = AEPWatchData('{}/{}'.format(self.dir, al))
                self.aep_data[resname] = awd.get_aepdata()
            return self

    def __init__(self, dir='.'):
        """Parse every ``runall*.log`` file found in *dir*."""
        self.dir = dir
        self.start = 0
        self.logdata = {}
        self.aepdata = None
        # Look in the requested directory; previously '.' was always used and
        # the dir argument was ignored.
        self.logs = [x for x in os.listdir(self.dir) if 'runall' in x and x.endswith('.log')]
        self.runlogs = {}
        for l in self.logs:
            lts = os.path.splitext(l)[0].split('-')
            # Key is '<second field>-<last field>' of the '-'-separated name.
            key = '{}-{}'.format(lts[1], lts[-1])
            self.runlogs[key] = (l, self.parse_logs(os.path.join(self.dir, l)))
        self.dataframe = self.get_dataframe()

    def __iter__(self):
        """Iterate over the ``(log file name, runs)`` pairs."""
        return (self.runlogs[rl] for rl in self.runlogs)

    def load_logs(self, dir):
        """Return the names of the ``.log`` files in *dir*."""
        logs = [x for x in os.listdir(dir) if x.endswith('.log')]
        return logs

    def parse_logs(self, log):
        """Parse the results directories named by the 'mkdir' lines of *log*.

        Returns:
            A list with one dictionary per 'mkdir' line, mapping each log type
            found in that directory to its RunLogData / MakeLogData object.
        """
        base_dir = getattr(self, 'dir', '.')
        runs = []
        with open(log) as fd:
            for line in fd:
                if 'mkdir' not in line:
                    continue  # skip
                fields = line.strip().split(' ')
                log_dir = fields[1]  # second field is the directory that was made
                # NOTE(review): relative directories are assumed to be relative
                # to the directory holding the runall logs - confirm.
                if not os.path.isabs(log_dir):
                    log_dir = os.path.join(base_dir, log_dir)
                data = {}
                for l in self.load_logs(log_dir):
                    path = '{}/{}'.format(log_dir, l)
                    found = False
                    for rlt in self.run_log_types:
                        if rlt in l:
                            data[rlt] = self.RunLogData(path)
                            found = True
                            break
                    if found:
                        continue
                    assert len(self.make_log_types)
                    for mlt in self.make_log_types:
                        if mlt in l:
                            data[mlt] = self.MakeLogData(path)
                            found = True
                            break
                    if found:
                        continue
                    assert False, "Unknown log type {}".format(l)
                runs.append(data)
                # NOTE: only the AEP data of the most recent directory is kept.
                self.aepdata = self.AEPData(log_dir)
        return runs

    def load_log_data(self, label, log):
        """Read the lines of *log* into ``self.logdata[label]``."""
        if getattr(self, 'logdata', None) is None:
            self.logdata = {}
        with open(log, 'r') as fd:
            self.logdata[label] = fd.readlines()
        return self

    def get_dataset_types(self):
        """Return the known data set names."""
        return self.MakeLogData.pb_datasets

    def get_pb_options(self):
        """Return the Polybench option names that are scanned for."""
        return self.MakeLogData.pb_options

    def get_log_types(self):
        """Return the run log types."""
        return self.run_log_types

    def get_make_log_types(self):
        """Return the make log types."""
        return self.make_log_types

    def get_dataframe(self):
        """Return one row per (run log, test) as a DataFrame.

        Columns are Timestamp, Run, Dataset, Type, one boolean column per
        Polybench option, then Test, Alloc and Ticks.  The frame is built on
        the first call and cached afterwards.
        """
        if getattr(self, 'dataframe', None) is not None:
            return self.dataframe
        optlist_labels = list(self.get_pb_options())
        labels = tuple(['Timestamp', 'Run', 'Dataset', 'Type'] + optlist_labels
                       + ['Test', 'Alloc', 'Ticks'])
        flat_data = []
        for rl in self.runlogs:
            _, results = self.runlogs[rl]
            timestamp, run = os.path.splitext(rl)[0].split('-')
            for res in results:
                dataset = res['make'].get_dataset()
                options = res['make'].get_all_options()
                optlist = [options[opt] for opt in optlist_labels]
                for lt in self.get_log_types():
                    for test, alloc, ticks in res[lt].get_results():
                        data = tuple([timestamp, int(run), dataset, lt] + optlist
                                     + [test, alloc, ticks])
                        assert len(data) == len(labels), \
                            "Label count {} doesn't match data length {}".format(len(labels), len(data))
                        flat_data.append(data)
        self.dataframe = pd.DataFrame(flat_data, columns=labels)
        return self.dataframe

# Build the log collection with the default directory argument ('.').
logdata = PolybenchLogData()



In [302]:
# print(logdata.get_dataframe())

# statistics supplies mean / harmonic_mean / stdev; gmean is the geometric mean.
import statistics
from scipy.stats.mstats import gmean

# One row per (run log, test); see PolybenchLogData.get_dataframe().
df = logdata.get_dataframe()

def select_data(df, Dataset='EXTRALARGE_DATASET', Type='dram', Test='2mm'):
    """Return the rows of *df* whose Dataset, Type and Test columns all match."""
    matches = df['Dataset'] == Dataset
    matches = matches & (df['Type'] == Type)
    matches = matches & (df['Test'] == Test)
    return df.loc[matches]

# Summarise the tick counts of every (test, data set, memory type) combination.
average_data = []
average_data_labels = ('Test', 'Dataset', 'Type', 'Alloc', 'Mean',
                       'Harmonic Mean', 'Geometric Mean', 'Standard Deviation')
for test in df['Test'].unique():
    for d in df['Dataset'].unique():
        for t in df['Type'].unique():
            data = select_data(df, d, t, test)
            ticks = list(data['Ticks'])
            if not ticks:
                continue  # this combination was never run; nothing to average
            alloc = statistics.mean(list(data['Alloc']))
            ticks_mean = statistics.mean(ticks)
            ticks_hmean = statistics.harmonic_mean(ticks)
            ticks_gmean = gmean(ticks)
            # stdev needs at least two samples; report NaN for a single run.
            ticks_stdev = statistics.stdev(ticks, ticks_mean) if len(ticks) > 1 else float('nan')
            average_data.append((test, d, t, alloc, ticks_mean, ticks_hmean, ticks_gmean, ticks_stdev))

average_df = pd.DataFrame(average_data, columns=average_data_labels)

dram_baseline = average_df.loc[average_df['Type'] == 'dram']
dram_value = average_df.loc[(average_df['Test'] == '3mm') &
                            (average_df['Dataset'] == 'EXTRALARGE_DATASET') &
                            (average_df['Type'] == 'dram')]

# Normalise every row against the dram row of the same test and data set, so
# a value below 1.0 means fewer ticks than dram.
normalized_mean = []
normalized_hmean = []
normalized_gmean = []
for index in range(len(average_data)):
    row = average_df.iloc[index, :]
    dram_value = average_df.loc[(average_df['Test'] == row['Test']) &
                                (average_df['Dataset'] == row['Dataset']) &
                                (average_df['Type'] == 'dram')]
    # Local names deliberately differ from the imported gmean() function so
    # that it is not overwritten with a float.
    nmean = row['Mean'] / list(dram_value['Mean'])[0]
    normalized_mean.append(nmean)
    nhmean = row['Harmonic Mean'] / list(dram_value['Harmonic Mean'])[0]
    normalized_hmean.append(nhmean)
    ngmean = row['Geometric Mean'] / list(dram_value['Geometric Mean'])[0]
    normalized_gmean.append(ngmean)
average_df['Normalized Mean'] = normalized_mean
average_df['Normalized Harmonic Mean'] = normalized_hmean
average_df['Normalized Geometric Mean'] = normalized_gmean

pmem1_faster = average_df.loc[(average_df['Normalized Mean'] < 1.0) & (average_df['Type'] == 'pmem1')]
print(pmem1_faster)
pmem7_faster = average_df.loc[(average_df['Normalized Mean'] < 1.0) & (average_df['Type'] == 'pmem7')]
print(pmem7_faster)
pmem1_slower = average_df.loc[(average_df['Normalized Mean'] > 1.0) & (average_df['Type'] == 'pmem1')]
print(pmem1_slower)
pmem7_slower = average_df.loc[(average_df['Normalized Mean'] > 1.0) & (average_df['Type'] == 'pmem7')]
print(pmem7_slower)
#print(len(df['Test'].unique()))



#print(df.loc[(df['Dataset'] == 'LARGE_DATASET') & (df['Type'] == 'dram')])
#ticks = select_data(df)['Ticks']
#ticks_mean = statistics.mean(ticks)
#ticks_hmean = statistics.harmonic_mean(ticks)
#ticks_gmean = gmean(ticks)
#ticks_stdev = statistics.stdev(ticks, ticks_mean)
#print(ticks_mean)
#print(ticks_hmean)
#print(ticks_gmean)
#print(ticks_stdev)
#print(ticks_stdev/ticks_mean)



                Test             Dataset   Type    Alloc         Mean  \
22               3mm       SMALL_DATASET  pmem1   917504   18001468.0   
55          cholesky    STANDARD_DATASET  pmem1  8396800  417490207.6   
58          cholesky       SMALL_DATASET  pmem1   132096    1745128.8   
70           doitgen       SMALL_DATASET  pmem1   532480    2541983.2   
82              gemm       SMALL_DATASET  pmem1   393216    5887008.8   
130             symm       SMALL_DATASET  pmem1   393216    6081858.0   
142            syr2k       SMALL_DATASET  pmem1   393216   10265664.0   
154             syrk       SMALL_DATASET  pmem1   262144    4964013.6   
178             trmm       SMALL_DATASET  pmem1   262144    2276978.4   
199          dynprog    STANDARD_DATASET  pmem1   520000  346917419.6   
214      gramschmidt       SMALL_DATASET  pmem1   393216    7313840.8   
226               lu       SMALL_DATASET  pmem1   131072     592578.4   
235           ludcmp    STANDARD_DATASET  pmem1  84

In [304]:
# Rebuild the flat results table by hand: one row per (run log, test).
optlist_labels = list(logdata.get_pb_options())
labels = tuple(['Timestamp', 'Run', 'Dataset', 'Type'] + optlist_labels + ['Test', 'Alloc', 'Ticks'])

flat_data = []
for run, results in logdata:
    # The log name splits on '-' into four fields; the second is the
    # timestamp and the fourth the run number.
    name_fields = os.path.splitext(run)[0].split('-')
    _, timestamp, _, run = name_fields
    for res in results:
        make_log = res['make']
        dataset = make_log.get_dataset()
        options = make_log.get_all_options()
        optlist = [options[opt] for opt in optlist_labels]
        for lt in logdata.get_log_types():
            for tests in res[lt].get_results():
                test, alloc, ticks = tests
                data = tuple([timestamp, int(run), dataset, lt] + optlist + [test, alloc, ticks])
                flat_data.append(data)
                assert len(data) == len(labels), \
                    "Label count {} doesn't match data length {}".format(len(labels), len(data))

df = pd.DataFrame(flat_data, columns=labels)
print(df)

                 Timestamp  Run             Dataset   Type  POLYBENCH_TIME  \
0     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
1     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
2     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
3     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
4     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
5     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
6     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
7     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
8     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
9     2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
10    2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram            True   
11    2019_06_20__00_42_09    1  EXTRALARGE_DATASET   dram      

In [305]:
# Group the run logs by data set and then by memory type.
datasets = {}
for run, rundata in logdata:
    for entry in rundata:
        by_type = datasets.setdefault(entry['make'].get_dataset(), {})
        for lt in logdata.get_log_types():
            if lt in entry:
                by_type.setdefault(lt, []).append(entry[lt])

# Add up size, ticks and the number of samples of every test.
merged_data = {}
for ds, by_type in datasets.items():
    merged_data[ds] = {}
    for lt, run_logs in by_type.items():
        totals = {}
        for results in run_logs:
            for test, size, ticks in results.get_results():
                acc = totals.setdefault(test, {'size': 0, 'ticks': 0, 'count': 0})
                acc['size'] += size
                acc['ticks'] += ticks
                acc['count'] += 1
        merged_data[ds][lt] = totals

# Divide the totals by the sample count.
average_data = {}
for ds, by_type in merged_data.items():
    average_data[ds] = {}
    for lt, totals in by_type.items():
        average_data[ds][lt] = {
            test: {'size': acc['size'] / acc['count'],
                   'ticks': float(acc['ticks']) / float(acc['count'])}
            for test, acc in totals.items()
        }

# create a flat version of this data
flat_data = [
    (ds, lt, test, values['size'], values['ticks'])
    for ds, by_type in average_data.items()
    for lt, tests in by_type.items()
    for test, values in tests.items()
]

print(flat_data)
#for run, rundata in logdata:
#    print(run)
#    for index in range(len(rundata)):
#        datasets[rundata[index]['make'].get_dataset()].append(rundata)

#for ds in datasets: print(datasets[ds])

    
#for lt in PolybenchLogData.run_log_types:
#    print(lt)

[('EXTRALARGE_DATASET', 'dram', '2mm', 640000000.0, 1762778099711.6), ('EXTRALARGE_DATASET', 'dram', '3mm', 896000000.0, 2640782292812.0), ('EXTRALARGE_DATASET', 'dram', 'atax', 80002400000.0, 47500745497.2), ('EXTRALARGE_DATASET', 'dram', 'bicg', 80003200000.0, 56846960766.8), ('EXTRALARGE_DATASET', 'dram', 'cholesky', 128032000.0, 26430779375.6), ('EXTRALARGE_DATASET', 'dram', 'doitgen', 16008000000.0, 3417700156393.2), ('EXTRALARGE_DATASET', 'dram', 'gemm', 384000000.0, 884200778442.0), ('EXTRALARGE_DATASET', 'dram', 'gemver', 80006400000.0, 404668345536.8), ('EXTRALARGE_DATASET', 'dram', 'gesummv', 1024192000.0, 365996908.0), ('EXTRALARGE_DATASET', 'dram', 'mvt', 80003200000.0, 364323223533.2), ('EXTRALARGE_DATASET', 'dram', 'symm', 384000000.0, 1533045982797.6), ('EXTRALARGE_DATASET', 'dram', 'syr2k', 384000000.0, 327721390001.6), ('EXTRALARGE_DATASET', 'dram', 'syrk', 256000000.0, 189601267573.6), ('EXTRALARGE_DATASET', 'dram', 'trisolv', 80001600000.0, 15244868063.2), ('EXTRALAR

In [306]:

def print_test_size(df, test, size):
    """Return ``(test, size, alloc)`` for the dram row of *test* / *size*.

    Despite its name this returns the tuple instead of printing it.  A
    failure message is printed and the allocation is reported as 0 unless
    exactly one row matches and its 'Alloc' value converts to an integer.

    Args:
        df: DataFrame with 'Dataset', 'Type', 'Test' and 'Alloc' columns.
        test: test name to look up.
        size: data set name to look up.
    """
    sd = select_data(df, Dataset=size, Type='dram', Test=test)
    alloc = sd['Alloc']
    if len(alloc) == 1:
        try:
            # Take the single element explicitly: int() on a whole Series is
            # deprecated in pandas.
            return (test, size, int(alloc.iloc[0]))
        except (TypeError, ValueError):
            pass  # e.g. NaN; reported below
    print('failed with {} ({}): {}'.format(test, size, alloc))
    return (test, size, 0)

# (test name, data set name) pairs; the loops below look up the dram
# allocation size of each pair.
# NOTE(review): presumably the data set size used for each test in the
# CoMerge comparison discussed in the notes further down - confirm.
comerge_map = [
    ('gramschmidt', 'LARGE_DATASET'),
    ('correlation', 'EXTRALARGE_DATASET'),
    ('mvt', 'EXTRALARGE_DATASET'),
    ('gemm', 'LARGE_DATASET'),
    ('symm', 'LARGE_DATASET'),
    ('fdtd-2d', 'EXTRALARGE_DATASET'),
    ('gemver', 'EXTRALARGE_DATASET'),
    ('durbin', 'EXTRALARGE_DATASET'),
    ('trisolv', 'EXTRALARGE_DATASET'),
    ('adi', 'EXTRALARGE_DATASET'),
    ('atax', 'EXTRALARGE_DATASET'),
    ('fdtd-apml', 'LARGE_DATASET'),
    ('seidel-2d', 'EXTRALARGE_DATASET'),
    ('doitgen', 'LARGE_DATASET'),
    ('ludcmp', 'LARGE_DATASET'),
    ('bicg', 'LARGE_DATASET'),
    ('syr2k', 'LARGE_DATASET'),
    ('3mm', 'LARGE_DATASET'),
    ('cholesky', 'EXTRALARGE_DATASET'),
    ('jacobi-2d-imper', 'EXTRALARGE_DATASET'),
    ('syrk', 'LARGE_DATASET'),
    ('gesummv', 'EXTRALARGE_DATASET'),
    ('trmm', 'EXTRALARGE_DATASET'),
    ('floyd-warshall', 'EXTRALARGE_DATASET'),
    ('covariance', 'STANDARD_DATASET'),
    ('2mm', 'LARGE_DATASET'),
    ('dynprog', 'LARGE_DATASET'),
    ('lu', 'LARGE_DATASET'),
    ('reg_detect', 'LARGE_DATASET'),
    ('jacobi-1d-imper', 'EXTRALARGE_DATASET'),
]

# Number of (test, data set) pairs listed above.
print(len(comerge_map))

def convert_size(size):
    """Scale a byte count to the largest 1024-based unit that keeps it < 1024.

    Args:
        size: number of bytes (anything ``float()`` accepts).

    Returns:
        A ``(value, unit)`` tuple with *value* a float and *unit* one of
        'B', 'KB', ..., 'YB'.  Sizes too large even for 'YB' are still
        expressed in 'YB' (the value is then 1024 or more) instead of falling
        back to the unconverted input labelled 'B'.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
    bound = 1024
    value = float(size)
    for unit in units[:-1]:
        if value < bound:
            return (value, unit)
        value = value / bound
    # Everything that did not fit a smaller unit is reported in the largest.
    return (value, units[-1])

# Print the dram allocation of every (test, data set) pair in comerge_map,
# both in bytes and scaled to a readable unit.
for cm in comerge_map:
    test, size, alloc = print_test_size(average_df, cm[0], cm[1])
    alloc_value, alloc_units = convert_size(alloc)
    print('{} ({}): {} -> {:.1f}{}'.format(test, size, alloc, alloc_value, alloc_units))

#print(print_test_size(average_df, 'gramschmidt', 'LARGE_DATASET'))

# Sanity check of convert_size() on successive powers of 1024.
test_sizes = [1024**index for index in range(10)]
print(test_sizes)
for t in test_sizes:
    # The template must be formatted; passing it to print() as a separate
    # argument printed the literal '{} -> {}'.
    print('{} -> {}'.format(t, convert_size(t)))


30
gramschmidt (LARGE_DATASET): 96000000 -> 91.6MB
correlation (EXTRALARGE_DATASET): 256064000 -> 244.2MB
mvt (EXTRALARGE_DATASET): 80003200000 -> 74.5GB
gemm (LARGE_DATASET): 96000000 -> 91.6MB
symm (LARGE_DATASET): 96000000 -> 91.6MB
fdtd-2d (EXTRALARGE_DATASET): 384000800 -> 366.2MB
gemver (EXTRALARGE_DATASET): 80006400000 -> 74.5GB
durbin (EXTRALARGE_DATASET): 1024256000 -> 976.8MB
trisolv (EXTRALARGE_DATASET): 80001600000 -> 74.5GB
adi (EXTRALARGE_DATASET): 384000000 -> 366.2MB
atax (EXTRALARGE_DATASET): 80002400000 -> 74.5GB
fdtd-apml (LARGE_DATASET): 4328628336 -> 4.0GB
seidel-2d (EXTRALARGE_DATASET): 128000000 -> 122.1MB
doitgen (LARGE_DATASET): 268959744 -> 256.5MB
ludcmp (LARGE_DATASET): 32080032 -> 30.6MB
bicg (LARGE_DATASET): 512256000 -> 488.5MB
syr2k (LARGE_DATASET): 96000000 -> 91.6MB
3mm (LARGE_DATASET): 224000000 -> 213.6MB
cholesky (EXTRALARGE_DATASET): 128032000 -> 122.1MB
jacobi-2d-imper (EXTRALARGE_DATASET): 256000000 -> 244.1MB
syrk (LARGE_DATASET): 64000000 -> 61

In [333]:
# print(average_df)
# Table of the dram allocation of every test in comerge_map for each data set.
dataset_columns = ('SMALL_DATASET', 'STANDARD_DATASET', 'LARGE_DATASET', 'EXTRALARGE_DATASET')
row_format = '{:10}\t{:10}\t{:10}\t{:10}\t{}'

datasize = {}
for cm in comerge_map:
    for dset in dataset_columns:
        test, size, alloc = print_test_size(average_df, cm[0], dset)
        assert size == dset, 'Returned size {} does not match requested size {}'.format(size, dset)
        per_test = datasize.setdefault(test, {})
        alloc_value, alloc_units = convert_size(alloc)
        per_test[size] = '{:.1f}{}'.format(alloc_value, alloc_units)

print(row_format.format('Test', 'Small', 'Standard', 'Large', 'Extra Large'))
for test, sizes in datasize.items():
    print(row_format.format(test, *[sizes[name] for name in dataset_columns]))



Test      	Small     	Standard  	Large     	Extra Large
gramschmidt	384.0KB   	6.0MB     	91.6MB    	366.2MB
correlation	3.8MB     	15.3MB    	61.1MB    	244.2MB
mvt       	1.9MB     	122.2MB   	488.5MB   	74.5GB
gemm      	384.0KB   	24.0MB    	91.6MB    	366.2MB
symm      	384.0KB   	24.0MB    	91.6MB    	366.2MB
fdtd-2d   	5.7MB     	22.9MB    	91.6MB    	366.2MB
gemver    	1.9MB     	122.3MB   	488.8MB   	74.5GB
durbin    	3.8MB     	244.3MB   	976.8MB   	976.8MB
trisolv   	1.9MB     	122.1MB   	488.4MB   	74.5GB
adi       	5.7MB     	24.0MB    	91.6MB    	366.2MB
atax      	1.9MB     	122.2MB   	488.5MB   	74.5GB
fdtd-apml 	8.5MB     	520.1MB   	4.0GB     	29.9GB
seidel-2d 	1.9MB     	7.6MB     	30.5MB    	122.1MB
doitgen   	520.0KB   	32.1MB    	256.5MB   	14.9GB
ludcmp    	133.0KB   	8.0MB     	30.6MB    	122.2MB
bicg      	1.9MB     	122.2MB   	488.5MB   	74.5GB
syr2k     	384.0KB   	24.0MB    	91.6MB    	366.2MB
3mm       	896.0KB   	56.0MB    	213.6MB   	854.5MB
cholesky  	12

One of the open questions has been "why do the values for some of these deviate from what I measure using the _standard_ Polybench suite?"

So first, let's walk through and identify those that do match:

gramschmidt - Matches LARGE_DATASET
gemm - Matches LARGE_DATASET
symm - Matches LARGE_DATASET
fdtd-apml - Matches STANDARD_DATASET
syr2k - Matches LARGE_DATASET
cholesky - Matches EXTRA_LARGE
trmm - Matches EXTRA_LARGE
floyd-warshall - Matches EXTRA_LARGE
lu - Matches EXTRA_LARGE

correlation - CoMerge has 137.4MB - found the header file sizes were changed.
mvt - CoMerge has 1.7GB - found the header file sizes were changed.
fdtd-2d - CoMerge has 1.4GB - found the header file sizes were changed.
gemver - CoMerge has 1.7GB - found the header file sizes were changed.
durbin - CoMerge has 1.5GB
trisolv -
adi - 
atax - 
seidel-2d -
doitgen - 
ludcmp - 
bicg - 
3mm - 
jacobi-2d-imper - 
syrk - 
gesummv -
covariance - 
2mm - 
dynprog - 
reg_detect - 
jacobi-1d-imper -



In [300]:
# mvt, direct from trace: print(sum([512000000,64000,64000,64000,64000]))
# this isn't 372.5GB
# The five allocation sizes being checked, added up and shown in readable units.
mvt_allocations = [80000000000, 800000, 800000, 800000, 800000]
size = sum(mvt_allocations)
print(convert_size(size))
# For comparison: 400000000000 bytes is the 372.5GB figure.
print(convert_size(400000000000))

# The individual dram runs of mvt with the extra large data set.
mvt_data = select_data(df, Dataset='EXTRALARGE_DATASET', Type='dram', Test='mvt')
print(mvt_data)


(74.50878620147705, 'GB')
(372.5290298461914, 'GB')
                 Timestamp  Run             Dataset  Type  POLYBENCH_TIME  \
9     2019_06_20__00_42_09    1  EXTRALARGE_DATASET  dram            True   
369   2019_06_20__00_42_09    2  EXTRALARGE_DATASET  dram            True   
729   2019_06_20__00_42_09    3  EXTRALARGE_DATASET  dram            True   
1089  2019_06_20__00_42_09    4  EXTRALARGE_DATASET  dram            True   
1449  2019_06_20__00_42_09    5  EXTRALARGE_DATASET  dram            True   

      POLYBENCH_NO_FLUSH_CACHE  POLYBENCH_LINUX_FIFO_SCHEDULER  \
9                        False                            True   
369                      False                            True   
729                      False                            True   
1089                     False                            True   
1449                     False                            True   

      POLYBENCH_CACHE_SIZE_KB  POLYBENCH_STACK_ARRAYS  POLYBENCH_DUMP_ARRAYS  \
9       

# Code beyond this point is not used in this notebook

I have left it here for reference

In [None]:
# Data set sizes in the order given in the notes above.
data_set_sizes = ('extra large', 'large', 'standard', 'small')

def load_run_logs(dir):
    """Return the names of the entries in ``dir`` that end with ``.log``.

    The names are returned bare (not joined with ``dir``) and in whatever
    order ``os.listdir`` reports them.
    """
    return [entry for entry in os.listdir(dir) if entry.endswith('.log')]

def parse_make_log(makelog):
    """Print ``makelog``; placeholder that performs no parsing."""
    print(makelog)
    
def parse_data_log(datalog):
    """Print ``datalog``; placeholder that performs no parsing."""
    print(datalog)

def load_runall_log(log):
    """Collect the per-run log file names recorded in a run-all log.

    Every line of ``log`` that contains ``mkdir`` is split on single spaces and
    its second token is taken as the directory created for one run. The
    ``.log`` files found in that directory are classified by substring
    (``dram``, ``pmem1``, ``pmem7``, ``make``; the first match in that order
    wins).

    Args:
        log: Path of the run-all log to read.

    Returns:
        A list with one dict per ``mkdir`` line, mapping each log type found to
        the bare file name. A later file of the same type overwrites an earlier
        one; a directory without ``.log`` files yields an empty dict.

    Raises:
        AssertionError: If a ``.log`` file matches none of the known types.
    """
    log_types = ('dram', 'pmem1', 'pmem7', 'make')
    runs = []
    with open(log) as fd:
        for line in fd:  # stream the file instead of materialising it
            if 'mkdir' not in line:
                continue  # skip
            fields = line.strip().split(' ')
            logs = load_run_logs(fields[1])  # second token is the directory we made
            if logs:
                # Debug output; guarded so an empty run directory is not fatal.
                print(logs[0].split('-'))
            data = {}
            for name in logs:
                for log_type in log_types:
                    if log_type in name:
                        data[log_type] = name
                        break
                else:
                    # Raised explicitly: an `assert` statement would be
                    # stripped under `python -O` and let unknown logs through.
                    raise AssertionError("Unknown log type {}".format(name))
            runs.append(data)
    return runs




In [None]:
# Entries of the current directory whose name contains 'pb-results'.
pb_results_dirs = [entry for entry in os.listdir('.') if 'pb-results' in entry]

# For each such directory, record the entries whose name contains 'aep-' and
# the entries whose name ends with '.log'.
pb_results = {}
for rd in pb_results_dirs:
    aep_logs = [entry for entry in os.listdir(rd) if 'aep-' in entry]
    pb_logs = [entry for entry in os.listdir(rd) if entry.endswith('.log')]
    pb_results[rd] = dict(aep=aep_logs, logs=pb_logs)


In [None]:
'''
Default Format
timestamp;;DIMM0;;;;;;;;;;;;;DIMM1;;;;;;;;;;;;;DIMM2;;;;;;;;;;;;;DIMM3;;;;;;;;;;;;;DIMM4;;;;;;;;;;;;;DIMM5;;;;;;;;;;;;;DIMM6;;;;;;;;;;;;;DIMM7;;;;;;;;;;;;;DIMM8;;;;;;;;;;;;;DIMM9;;;;;;;;;;;;;DIMM10;;;;;;;;;;;;;DIMM11;;;;;;;;;;;;;
epoch;timestamp;bytes_read (derived);bytes_written (derived);read_hit_ratio (derived);write_hit_ratio (derived);wdb_merge_percent (derived);sxp_read_ops (derived);sxp_write_ops 
'''
# Column names for the raw CSV: two leading columns, then len(per_entry)
# columns for each of the 12 DIMMs, then one trailing column.
# NOTE(review): per_entry here starts with 'epoch' and 'timestamp', whereas the
# AEPWatchData class treats those two as per-row columns followed by 13
# per-DIMM measurements -- confirm which labelling matches the log format.
columns = ['column 1', 'column 2']
per_entry = ('epoch', 'timestamp', 'bytes read', 'bytes written', 'read hit ratio',
        'write hit ratio', 'wdb merge percent',
        'sxp read ops', 'sxp write ops',
        'read 64B ops received', 'write 64B ops received',
        'ddrt read ops', 'ddrt write ops')
for index in range(0, 12):
    dimm_label = 'DIMM{}'.format(index)
    for e in per_entry:
        columns.append('{}:{}'.format(dimm_label, e))
columns.append('unused')

gs_data = pd.read_csv("log.csv", skiprows=6, sep=';', header=None, names=columns)
# DataFrame.drop returns a new frame; keep the result so the trailing
# 'unused' column is really removed.
gs_data = gs_data.drop('unused', axis=1)

# Split the wide table into one frame per DIMM, each relabelled with the
# per_entry names and ordered by its 'timestamp' column.
data = {}
for index in range(0, 12):
    dimm_label = 'DIMM{}'.format(index)
    start = 2 + (index * len(per_entry))
    end = 2 + ((index + 1) * len(per_entry))
    # Copy the slice so relabelling and sorting act on an independent frame
    # rather than on a view of gs_data (avoids SettingWithCopyWarning).
    data[dimm_label] = gs_data.iloc[:, start:end].copy()
    data[dimm_label].columns = list(per_entry)
    # mergesort is stable: rows with equal timestamps keep their file order.
    data[dimm_label] = data[dimm_label].sort_values(by='timestamp', kind='mergesort', ascending=True)

def plot_specific_entry(data, test_name, entry_name):
    """Plot one measurement against time for DIMM6 through DIMM11.

    All six series are drawn on the current matplotlib figure with a legend,
    and the figure is then shown.

    Args:
        data: Mapping from DIMM label (``'DIMM6'`` ... ``'DIMM11'``) to a table
            indexable by column name; each table must provide ``'timestamp'``
            and ``entry_name``.
        test_name: Text placed first in the figure title.
        entry_name: Name of the column plotted on the y-axis; also appended to
            the title.
    """
    plt.style.use("ggplot")
    plt.title('{} {}'.format(test_name, entry_name))
    # Only DIMM6..DIMM11 are plotted; the lower-numbered DIMMs are skipped.
    for dimm in range(6, 12):
        dimm_label = 'DIMM{}'.format(dimm)
        dimm_data = data[dimm_label]
        plt.plot(dimm_data['timestamp'], dimm_data[entry_name], label=dimm_label)
    plt.legend()
    plt.show()

# Plot every measurement except the first two per_entry names
# ('epoch' and 'timestamp'), one figure per measurement.
for index, entry_name in enumerate(per_entry[2:], start=2):
    plot_specific_entry(data, 'ranjan', entry_name)


In [None]:
# Directory holding the AEPWatch logs; every entry in it is treated as a log.
logdir = 'aepwatch_logs'
# List and join through `logdir` so the directory name lives in one place.
logs = [os.path.join(logdir, name) for name in os.listdir(logdir)]
print(logs)

In [None]:
class AEPWatchData:
    """Load one AEPWatch log and plot selected per-module measurements.

    The log is read as a ';'-separated file whose first six rows are skipped.
    Each remaining row holds the ``per_row`` columns followed by the
    ``per_entry`` columns for each of ``module_count`` modules and one trailing
    empty column. Per-module columns are named ``'<entry>.<module index>'``.

    Instance attributes:
        log, log_name, log_base: Path as given, its base name, and the base
            name without extension.
        columns: Column names applied to the raw file (including 'unused').
        dimms: ``'DIMM<i>'`` labels, one per module.
        ddrt_read, ddrt_write, read_hit_ratio, write_hit_ratio,
        wdb_merge_percentage, sxp_read, sxp_write: Per-module column names for
            the corresponding measurement.
        aep_data: The loaded table with the 'unused' column removed.
        starting_module, ending_module: Inclusive range of modules plotted.
    """

    per_row = ('epoch', 'timestamp')
    per_entry = ('bytes read', 'bytes written', 'read hit ratio',
            'write hit ratio', 'wdb merge percent',
            'sxp read ops', 'sxp write ops',
            'read 64B ops received', 'write 64B ops received',
            'ddrt read ops', 'ddrt write ops', 'block read ops', 'block write ops')
    # Named dash patterns; the plots cycle through them by module index.
    linestyles = OrderedDict(
        [('solid',               (0, ())),
         ('loosely dotted',      (0, (1, 10))),
         ('dotted',              (0, (1, 5))),
         ('densely dotted',      (0, (1, 1))),

         ('loosely dashed',      (0, (5, 10))),
         ('dashed',              (0, (5, 5))),
         ('densely dashed',      (0, (5, 1))),

         ('loosely dashdotted',  (0, (3, 10, 1, 10))),
         ('dashdotted',          (0, (3, 5, 1, 5))),
         ('densely dashdotted',  (0, (3, 1, 1, 1))),

         ('loosely dashdotdotted', (0, (3, 10, 1, 10, 1, 10))),
         ('dashdotdotted',         (0, (3, 5, 1, 5, 1, 5))),
         ('densely dashdotdotted', (0, (3, 1, 1, 1, 1, 1)))])

    module_count = 12  # our systems have 12 PMEM modules

    figdict = {
        # measurement       :  y-label,       column name,          description,                      Derived
        'ddrt read'         : ('operations', 'ddrt read ops',       'CPU read operations (64B)',      False),
        'ddrt write'        : ('operations', 'ddrt write ops',      'CPU write operations (64B)',     False),
        'sxp read ops'      : ('operations', 'sxp read ops',        'Controller read ops (64B)',      False),
        'sxp write ops'     : ('operations', 'sxp write ops',       'Controller write ops (64B)',     False),
        'read hit ratio'    : ('ratio',      'read hit ratio',      'Prefetcher read efficiency',     True),
        'write hit ratio'   : ('ratio',      'write hit ratio',     'Prefetcher write efficiency',    True),
        'write merge ratio' : ('ratio',      'wdb merge percent',   'Write merge efficiency',         True)
        # omitted:
        # cpu bytes read - this is ddrt * 64
        # cpu bytes written - this is ddrt * 64
        # pmem bytes read - this is sxp read ops * 64
        # pmem bytes written - this is sxp write ops * 64
        # read blocks - this is always zero
        # write blocks - this is always zero
    }

    def __init__(self, log, starting_module=6, ending_module=11):
        """Build the column names and load ``log``.

        Args:
            log: Path of the log file to read.
            starting_module: Index of the first module to plot (inclusive).
            ending_module: Index of the last module to plot (inclusive).
        """
        self.log = log
        self.log_name = os.path.basename(self.log)
        self.log_base = os.path.splitext(self.log_name)[0]

        # These lists are created per instance. As class attributes they were
        # shared, so every new instance appended another 12 entries to them.
        modules = range(self.module_count)
        self.dimms = ['DIMM{}'.format(i) for i in modules]
        self.ddrt_read = ['ddrt read ops.{}'.format(i) for i in modules]
        self.ddrt_write = ['ddrt write ops.{}'.format(i) for i in modules]
        self.read_hit_ratio = ['read hit ratio.{}'.format(i) for i in modules]
        self.write_hit_ratio = ['write hit ratio.{}'.format(i) for i in modules]
        self.wdb_merge_percentage = ['wdb merge percent.{}'.format(i) for i in modules]
        self.sxp_read = ['sxp read ops.{}'.format(i) for i in modules]
        self.sxp_write = ['sxp write ops.{}'.format(i) for i in modules]

        self.columns = list(self.per_row)
        for index in modules:
            self.columns.extend('{}.{}'.format(e, index) for e in self.per_entry)
        self.columns.append('unused')

        self.starting_module = starting_module
        self.ending_module = ending_module
        self._load_log()

    def _load_log(self):
        """Read ``self.log`` into ``self.aep_data`` and return ``self``."""
        self.aep_data = pd.read_csv(self.log, skiprows=6, sep=';', header=None, names=self.columns)
        self.aep_data = self.aep_data.drop('unused', axis=1)  # drop the empty last column
        return self

    def get_log_base_name(self):
        """Return the log's file name without directory or extension."""
        return self.log_base

    def plot_specific_entry(self, entry_name):
        """Plot one ``figdict`` measurement for the selected modules.

        The x-axis is the 'epoch' column relative to its first value. The
        figure is saved as ``'<log_base>-<entry_name>.png'`` and then shown.
        Nothing happens when the log holds no rows.

        Args:
            entry_name: A key of ``figdict``.

        Raises:
            KeyError: If ``entry_name`` is not a key of ``figdict``.
        """
        times = list(self.aep_data['epoch'])
        if not times:
            return  # nothing to plot
        timestamps = [t - times[0] for t in times]
        _, column, description, _ = self.figdict[entry_name]
        # Dash patterns in declaration order, looked up on the class. The bare
        # names `linestyles` and `ls` used before are not defined by it.
        styles = list(self.linestyles.values())

        plt.style.use('ggplot')
        plt.title(self.log_base)
        plt.ylabel(description, fontsize="x-large")
        plt.xlabel('time (seconds)', fontsize="x-large")

        for index in range(self.starting_module, self.ending_module + 1):
            plt.plot(timestamps, list(self.aep_data['{}.{}'.format(column, index)]),
                     label=self.dimms[index], linestyle=styles[index % len(styles)], marker='o')

        plt.legend()
        plt.savefig('{}-{}.png'.format(self.log_base, entry_name))
        plt.show()

    def plot_data(self):
        """Plot every measurement listed in ``figdict``."""
        for entry in self.figdict:
            self.plot_specific_entry(entry)


In [None]:
# Load each path in `logs` and render every figure for it.
for log_path in logs:
    aepdata = AEPWatchData(log_path)
    aepdata.plot_data()
