In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import time
import os
# import pickle
import dill as pickle  # Use dill instead of pickle

---
# General-purpose helper functions and shared configuration

In [2]:
def set_category(x, cat_list, cat_size, ranges_size_flag=False):
    """Map a numeric value to the first range label that contains it.

    Parameters
    ----------
    x : number
        Value to classify.
    cat_list : list of str
        Range labels written as '[min-max]'. A value belongs to a range when
        min < x <= max (lower bound exclusive, upper bound inclusive).
    cat_size : list
        Alternative labels, parallel to ``cat_list``. Only read when
        ``ranges_size_flag`` is true.
    ranges_size_flag : bool
        When true, return the matching entry of ``cat_size`` instead of the
        range label itself.

    Returns
    -------
    The label of the first matching range, or None when no range matches.
    """
    for position, label in enumerate(cat_list):
        bounds = label.strip('[').strip(']').split('-')
        lower, upper = float(bounds[0]), float(bounds[1])
        if not (x > lower and x <= upper):
            continue
        return cat_size[position] if ranges_size_flag == True else label
    return None

def set_category2(x, cat_list, cat_size, ranges_size_flag=False):
    """Translate a category value into its parallel entry of ``cat_size``.

    ``x`` is compared for equality against each element of ``cat_list``; the
    entry of ``cat_size`` at the position of the first match is returned.
    Returns None when ``x`` is not in ``cat_list``. ``ranges_size_flag`` is
    accepted for signature parity with ``set_category`` but is not used.
    """
    for position, candidate in enumerate(cat_list):
        if x == candidate:
            return cat_size[position]
    return None

# All available devices; add new devices here.
ranges_dev = [
    # GPUs
    'NVIDIA-P100', 'NVIDIA-V100', 'NVIDIA-A100', 'NVIDIA-H100', 'AMD-MI250',
    # AMD CPUs
    'AMD-EPYC-24', 'AMD-EPYC-64',
    # Intel CPUs
    'INTEL-XEON-14', 'INTEL-ICY-16', 'INTEL-SAPPHIRE-56',
    # Arm CPUs
    'ARM-NEON-80', 'ARM-GRACE-72',
    # IBM CPUs
    'IBM-POWER9-32',
]
# Keep the backup as an independent copy: a plain assignment would only alias
# the same list object, so any in-place edit of ranges_dev (append/remove/sort)
# would silently change the "backup" as well.
ranges_dev_backup = list(ranges_dev)

# Per-device y-axis limits, stored as [lower, upper], so that every device can
# be plotted on its own scale.
y_limit_dictionary = {
    # GPUs
    'NVIDIA-P100': [0, 100],
    'NVIDIA-V100': [0, 140],
    'NVIDIA-A100': [0, 230],
    'NVIDIA-H100': [0, 630],
    'AMD-MI250': [0, 240],

    # AMD CPUs
    'AMD-EPYC-64': [0, 240],
    'AMD-EPYC-24': [0, 110],

    # Intel CPUs
    'INTEL-XEON-14': [0, 50],
    'INTEL-ICY-16': [0, 70],
    'INTEL-SAPPHIRE-56': [0, 320],

    # Arm CPUs
    'ARM-NEON-80': [0, 190],
    'ARM-GRACE-72': [0, 280],

    # IBM CPUs
    'IBM-POWER9-32': [0, 55],

    # FPGA (currently disabled)
    # 'Alveo-U280': [0, 30],
}

# Per-CPU threshold values keyed by device name.
# NOTE(review): presumably the last-level-cache capacity of each CPU in MB
# (to be compared against 'mem_footprint') — confirm units against the caller.
llc_thresholds = {
    # AMD
    'AMD-EPYC-64': 256,
    'AMD-EPYC-24': 128,

    # Intel
    'INTEL-XEON-14': 19.25,
    'INTEL-ICY-16': 24,
    'INTEL-SAPPHIRE-56': 105,

    # Arm
    'ARM-NEON-80': 80,
    'ARM-GRACE-72': 114,

    # IBM
    'IBM-POWER9-32': 80,
}

# Formats benchmarked on each device. Both the device order and the format
# order inside each device are significant: they define the order in which
# entries appear when results are printed/plotted per device.
_formats_per_device = [
    ('NVIDIA-P100', ['cu-COO', 'cu-CSR', 'cu-HYB',
                     'CSR5']),

    ('NVIDIA-V100', ['cu-COO', 'cu-CSR', 'cu-HYB',
                     'CSR5']),

    ('NVIDIA-A100', ['cu-COO', 'cu-CSR',
                     'Merge-CSR', 'SELL-C-σ']),

    ('NVIDIA-H100', ['cu-CSR', 'cu-COO',
                     'Vec-CSR', 'Ada-CSR', 'Custom-CSR',
                     'CSR5', 'DASP', 'Merge-CSR']),

    ('AMD-MI250', ['roc-CSR', 'roc-COO', 'roc-HYB',
                   'Vec-CSR', 'Ada-CSR', 'Custom-CSR',
                   'ACC-Line', 'ACC-Flat']),

    ('AMD-EPYC-24', ['MKL-IE', 'AOCL',
                     'Naive-CSR', 'Vec-CSR',  # 'Vec-Bal-CSR' (disabled)
                     'CSR5', 'SparseX', 'Merge-CSR', 'SELL-C-σ']),

    ('AMD-EPYC-64', ['MKL-IE', 'AOCL',
                     'Naive-CSR', 'Bal-CSR', 'Vec-CSR',
                     'CSR5', 'SparseX', 'Merge-CSR', 'SELL-C-σ', 'LCM']),

    ('INTEL-XEON-14', ['MKL-IE',
                       'Naive-CSR', 'Vec-CSR',
                       'CSR5', 'SparseX', 'Merge-CSR', 'SELL-C-σ']),

    ('INTEL-ICY-16', ['MKL-IE',
                      'Naive-CSR', 'Vec-CSR',
                      'CSR5', 'SparseX', 'Merge-CSR', 'SELL-C-σ']),

    ('INTEL-SAPPHIRE-56', ['MKL-IE',  # 'AOCL' (disabled)
                           'Naive-CSR', 'Bal-CSR', 'Vec-CSR',
                           'CSR5', 'SparseX', 'Merge-CSR', 'SELL-C-σ', 'LCM']),

    ('ARM-NEON-80', ['ARM-lib',
                     'Naive-CSR',
                     'SparseX', 'Merge-CSR', 'SELL-C-σ']),

    ('ARM-GRACE-72', ['ARM-lib',
                      'Naive-CSR', 'Bal-CSR', 'Vec-CSR',
                      'SparseX', 'Merge-CSR']),

    ('IBM-POWER9-32', ['Naive-CSR', 'Bal-CSR', 'Vec-CSR',
                       'SparseX', 'Merge-CSR']),
]

# Flat list of '( <device> )\t<format>' labels, generated from the table above
# so the label layout is written exactly once.
ranges_impl_arch = [
    '( ' + device + ' )\t' + format_name
    for device, device_formats in _formats_per_device
    for format_name in device_formats
]
# Independent copy: a plain assignment would alias the same list object, so
# in-place edits of ranges_impl_arch would also change the "backup".
ranges_impl_arch_backup = list(ranges_impl_arch)
def filter_ranges_impl_arch(ranges_impl_arch_backup, groupdata, ranges_dev):
    """Keep only the '( device )\\tformat' labels that are actually present.

    A label is kept when its device is listed in ``ranges_dev`` and its format
    occurs in ``groupdata['format_name']`` for the rows whose ``'System'``
    equals that device. The input order is preserved.

    Parameters
    ----------
    ranges_impl_arch_backup : list of str
        Candidate labels of the form '( <device> )\\t<format>'.
    groupdata : pandas.DataFrame
        Must provide the columns 'System' and 'format_name'.
    ranges_dev : container of str
        Devices to keep.

    Returns
    -------
    list of str
    """
    # The set of formats per device is loop-invariant: compute it once per
    # device (lazily) instead of re-filtering the DataFrame for every label.
    formats_by_device = {}
    kept = []
    for label in ranges_impl_arch_backup:
        device = label.split(' )')[0].split('( ')[1]
        if device not in ranges_dev:
            continue
        if device not in formats_by_device:
            formats_by_device[device] = set(groupdata[groupdata['System'] == device]['format_name'])
        if label.split('\t')[1] in formats_by_device[device]:
            kept.append(label)
    return kept

def calculate_format_wins(ranges_impl_arch, groupdata, ranges_dev):
    """Compute, per device, the share of rows attributed to each format.

    For every device in ``ranges_dev`` (outer order) and every label of
    ``ranges_impl_arch`` that mentions that device (inner order), append the
    percentage of the device's rows in ``groupdata`` whose 'format_name'
    equals the label's format, rounded to two decimals.

    Special cases:
    * a share of exactly 100% is reported as 0 (kept from the original code,
      where it was introduced for the FPGA, which had a single kernel);
    * a device without any rows in ``groupdata`` yields 0 for each of its
      labels instead of raising ZeroDivisionError.

    Parameters
    ----------
    ranges_impl_arch : list of str
        Labels of the form '( <device> )\\t<format>'.
    groupdata : pandas.DataFrame
        Must provide the columns 'System' and 'format_name'.
    ranges_dev : list of str
        Devices to report on.

    Returns
    -------
    list of numbers, one per (device, matching label) pair.
    """
    wins = []
    for device in ranges_dev:
        device_rows = groupdata[groupdata['System'] == device]
        device_total = device_rows.shape[0]
        for impl_arch_curr in ranges_impl_arch:
            if device not in impl_arch_curr:
                continue
            impl = impl_arch_curr.split('\t')[1]
            impl_total = device_rows[device_rows['format_name'] == impl].shape[0]
            if device_total == 0:
                # no measurements for this device: nothing can have "won"
                perc = 0
            else:
                perc = np.round(impl_total / device_total * 100, 2)
            if perc == 100:  # This was for the FPGA kernel, which was only 1
                perc = 0
            wins.append(perc)
    return wins

# Bucket definitions per matrix feature, used later for plotting.
# Every label is written as '[min-max]'.

# mem_footprint: consecutive power-of-two buckets, '[4-8]' up to '[1024-2048]'
ranges_memr = ['[%d-%d]' % (low, 2 * low) for low in (4, 8, 16, 32, 64, 128, 256, 512, 1024)]

# avg_nnz_per_row
# (earlier alternatives, kept for reference)
# ranges_anr = ['[0-20]', '[20-75]','[75-150]', '[150-510]']
# ranges_anr = ['[0-15]', '[15-40]', '[40-75]','[75-150]', '[150-510]']
# ranges_anr = ['[0-10]', '[10-50]', '[50-510]']
ranges_anr = [
    '[0-10]',
    '[10-50]',
    '[50-100]',
    '[100-510]',
]

# skew_coeff
# ranges_skew = ['[0-100]', '[100-500]', '[500-2000]', '[2000-180000]']
ranges_skew = [
    '[0-1.5]',
    '[1.5-50]',
    '[50-250]',
    '[250-3000]',
    '[3000-10000]',
]

# avg_num_neighbours / cross_row_similarity (two buckets each)
# ranges_ann = ['[0-0.6]', '[0.6-1.4]', '[1.4-2]']
# ranges_crs = ['[0-0.3]', '[0.3-0.7]', '[0.7-1]']
ranges_ann = ['[0-1]', '[1-2]']
ranges_crs = ['[0-0.5]', '[0.5-1]']

# Size classes and the two-letter regularity classes built from them
# ranges_size = ['S', 'M', 'L']
# ranges_regularity = ['SS', 'SM', 'SL', 'MS', 'MM', 'ML', 'LS', 'LM', 'LL']
ranges_size = ['S', 'L']
ranges_regularity = ['SS', 'SL', 'LS', 'LL']

# Feature column names and, position by position, the buckets defined for them
cat_list = [
    'mem_footprint',
    'avg_nnz_per_row',
    'skew',
    'avg_num_neighbours',
    'cross_row_similarity',
]
ranges_list = [
    ranges_memr,
    ranges_anr,
    ranges_skew,
    ranges_ann,
    ranges_crs,
]


# Store variables and functions to be used in other jupyter notebooks too (using a Pickle/Dill object)

In [3]:
# Bundle the shared configuration and helper functions under their own names
# and serialise the bundle to 'objects.pkl' (pickle here is the dill alias
# imported at the top of the notebook).
objects_to_save = dict(
    # data
    ranges_dev=ranges_dev,
    ranges_dev_backup=ranges_dev_backup,
    y_limit_dictionary=y_limit_dictionary,
    llc_thresholds=llc_thresholds,
    ranges_impl_arch=ranges_impl_arch,
    ranges_impl_arch_backup=ranges_impl_arch_backup,
    ranges_memr=ranges_memr,
    ranges_anr=ranges_anr,
    ranges_skew=ranges_skew,
    ranges_ann=ranges_ann,
    ranges_crs=ranges_crs,
    ranges_size=ranges_size,
    ranges_regularity=ranges_regularity,
    cat_list=cat_list,
    ranges_list=ranges_list,

    # functions
    set_category=set_category,
    set_category2=set_category2,
    filter_ranges_impl_arch=filter_ranges_impl_arch,
    calculate_format_wins=calculate_format_wins,
)

with open('objects.pkl', 'wb') as pickle_file:
    pickle.dump(objects_to_save, pickle_file)


---
# Some variables and functions to be used when reading csv files

In [4]:
# Column names of the benchmark result tables, in file order.
header_names = (
    # matrix identification
    ['matrix_name', 'distribution', 'placement', 'seed']
    # matrix dimensions and footprint
    + ['nr_rows', 'nr_cols', 'nr_nzeros', 'density', 'mem_footprint', 'mem_range']
    # matrix features
    + ['avg_nnz_per_row', 'std_nnz_per_row']
    + ['avg_bw', 'std_bw', 'avg_bw_scaled', 'std_bw_scaled']
    + ['avg_sc', 'std_sc', 'avg_sc_scaled', 'std_sc_scaled']
    + ['skew', 'avg_num_neighbours', 'cross_row_similarity']
    # measurement columns, followed by the device columns
    + ['format_name', 'time', 'gflops', 'W_avg', 'J_estimated']
    + ['System', 'Arch']
)

# Suffix used in result file names: 'd' for double-precision arithmetic
# (alternative: precision = 'f')
precision = 'd'

# Root directory of the benchmark results, relative to this notebook
prefix = '../benchmark_results/'
# Maps a result file name to the format name written inside that file.
# When a new format is added, register it here as key:value with:
# key:   file name without the '_<precision>.csv' suffix (e.g. without '_d.csv')
# value: the format name exactly as it appears inside the file
impl_dict = {
    # CPU CSR variants
    'csr_naive': 'Naive_CSR_CPU',
    'csr': 'Custom_CSR_B',
    'csr_vector_x86': 'Custom_CSR_BV_x86',
    'csr_vector_sve': 'Custom_CSR_BV_SVE',

    # CPU vendor libraries
    'mkl_ie': 'MKL_IE',
    'aocl_optmv': 'AOCL_OPTMV',
    'armpl': 'ARMPL',

    # CPU research formats
    'csr5': 'CSR5',
    'merge': 'MERGE',
    'sell_C_s': 'SELL-32-1',
    # 'sell_C_s': 'SELL-32-512',
    'sparsex': 'SparseX',
    'lcm': 'LCM', 

    # ROCm custom CSR kernels
    'csr_rocm_vector_b512_nv': 'Custom_CSR_ROCM_VECTOR_b512',
    # NOTE(review): a ROCm file mapped to a *CUDA* format name. This may be how
    # the benchmark actually labels its rows, or a copy/paste slip for
    # 'Custom_CSR_ROCM_ADAPTIVE_b512_1' — confirm against the csv contents.
    'csr_rocm_adaptive_b512_mb1_nv': 'Custom_CSR_CUDA_ADAPTIVE_b512_1',
    'csr_rocm_const_nnz_per_thread_b512_nnz4_nv': 'Custom_CSR_ROCM_constant_nnz_per_thread_b512_nnz4',

    # ROCm ACC kernels
    'csr_rocm_acc_flat_b512_nv': 'ACC_FLAT_b512',
    'csr_rocm_acc_line_enhance_b512_nv': 'ACC_LINE_ENHANCE_b512',

    # rocSPARSE
    'rocsparse_csr_nv': 'ROCSPARSE_CSR',
    'rocsparse_coo_nv': 'ROCSPARSE_COO',
    'rocsparse_hyb_nv': 'ROCSPARSE_HYB',

    # CUDA custom CSR kernels
    'csr_cuda_vector_b256_nv': 'Custom_CSR_CUDA_VECTOR_b256',
    'csr_cuda_adaptive_b256_mb1_nv': 'Custom_CSR_CUDA_ADAPTIVE_b256_1',
    'csr_cuda_const_nnz_per_thread_b1024_nnz4_nv': 'Custom_CSR_CUDA_constant_nnz_per_thread_b1024_nnz4',

    # CUDA research formats
    'csr5_cuda_nv': 'CSR5_CUDA',
    'merge_cuda_nv': 'MERGE_CUDA',
    'dasp_cuda_nv': 'DASP_CUDA',

    # cuSPARSE
    'cusparse_csr_nv': 'CUSPARSE_CSR',
    'cusparse_coo_nv': 'CUSPARSE_COO',
}

def remove_formats(df, formats_to_discard):
    """Drop every row whose 'format_name' is one of ``formats_to_discard``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'format_name' column. It is not modified.
    formats_to_discard : iterable of str
        Format names to remove; may be empty.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when there is nothing to discard, otherwise a filtered
        frame that keeps the original row labels.
    """
    discard = list(formats_to_discard)
    if not discard:
        return df
    # one vectorised pass instead of one filtering pass (and copy) per format
    return df[~df['format_name'].isin(discard)]

def print_formats_per_device(df):
    """Print a short summary of ``df``.

    The summary consists of the frame's shape, the set of all values in
    'format_name', and one line per distinct 'System' value listing the set of
    formats recorded for it. Sets are printed as-is, so their element order is
    not fixed.
    """
    separator = '------------------------'
    print(separator)
    print('Size of dataframe')
    print(df.shape)
    print('Tested formats')
    print(set(df['format_name']))
    print('Tested formats per device:')
    for device in set(df['System']):
        device_formats = set(df.loc[df['System'] == device, 'format_name'])
        print(device, '\t', device_formats)
    print(separator)


In [5]:
def fix_features(root, file):
    """Rewrite a result csv so its feature columns come from the reference list.

    For every result row, everything before ',<format name>' is replaced by the
    corresponding line of 'feats_synthetic.csv' / 'feats_friends.csv' (read
    from the current working directory; chosen by which keyword occurs in the
    file path). The original file is kept next to it with the suffix
    '_BAD_FEATURES.csv'; the presence of that backup marks the file as already
    processed, in which case nothing is done.

    Relies on the module-level ``precision`` (file-name suffix) and
    ``impl_dict`` (file name -> format name as written inside the file).
    Files whose path contains 'sell_C_s' are treated as having no header line.

    Parameters
    ----------
    root : str
        Directory that contains the file.
    file : str
        File name, expected to be '<impl>_<precision>.csv'.

    Raises
    ------
    ValueError
        If the path names neither dataset, if a result row does not contain
        the expected format name, or if a result row has no counterpart in the
        reference feature list. All of these are raised before anything is
        renamed or written.
    KeyError
        If '<impl>' is not registered in ``impl_dict``.
    """
    file_path = os.path.join(root, file)
    backup_file_path = file_path.replace('.csv', '_BAD_FEATURES.csv')

    # The backup only exists once the file has been rewritten: nothing to do.
    if os.path.isfile(backup_file_path):
        return

    print('need to do sth for', backup_file_path)

    # 'synthetic' wins when both keywords occur in the path (same outcome as
    # the original pair of independent checks).
    if 'synthetic' in file_path:
        features_file = 'feats_synthetic.csv'
    elif 'friends' in file_path:
        features_file = 'feats_friends.csv'
    else:
        raise ValueError(
            f"cannot choose a feature list for {file_path}: "
            "the path contains neither 'synthetic' nor 'friends'")
    with open(features_file) as f:
        features = [line.strip('\n') for line in f.readlines()]

    has_header = 'sell_C_s' not in file_path
    with open(file_path, 'r') as f:
        lines = f.readlines()
    filtered_lines = []
    if has_header:
        # carry the header line over unchanged
        filtered_lines.append(lines[0])
        lines = lines[1:]

    impl = file.replace('_' + precision + '.csv', '')
    impl2 = impl_dict[impl]
    marker = ',' + impl2

    def replace_features(feature_line, result_line):
        # keep what follows ',<format name>' and prepend the reference features
        parts = result_line.split(marker)
        if len(parts) < 2:
            raise ValueError(f'format name {impl2!r} not found in a row of {file_path}: {result_line!r}')
        return feature_line + marker + parts[1]

    if len(features) == len(lines):
        # same number of rows: rows are paired by position
        for feature_line, result_line in zip(features, lines):
            filtered_lines.append(replace_features(feature_line, result_line))
    else:
        # Different number of rows: walk the reference list forward until its
        # first five fields match those of the current result row.
        cnt = 0
        for result_line in lines:
            result_key = result_line.split(',')[0:5]
            while cnt < len(features) and features[cnt].split(',')[0:5] != result_key:
                cnt += 1
            if cnt >= len(features):
                raise ValueError(
                    f'no matching feature line for a row of {file_path}: {result_line!r}')
            filtered_lines.append(replace_features(features[cnt], result_line))
            cnt += 1

    # Back up the original file, then write the corrected rows in its place
    os.rename(file_path, backup_file_path)
    with open(file_path, 'w') as f:
        f.writelines(filtered_lines)


In [6]:
def fix_synthetic_csv_errors(file_path, flag=0):
    """Remove rows that do not mention 'synthetic' from a result csv.

    A data row is considered valid when it contains the word 'synthetic'
    (case-insensitive). If every data row is valid the file is left untouched.
    Otherwise the original file is renamed to '<name>_BAD.csv' and a filtered
    copy (header, if any, plus the valid rows) is written to ``file_path``.

    Validity is decided per row. Comparing the row count with the number of
    *occurrences* of the word (as done previously) flagged clean files whose
    rows mention it more than once, so they were rewritten on every run and
    the backup was overwritten with already-filtered data.

    Parameters
    ----------
    file_path : str
        Path of the csv file to check.
    flag : int
        0 (default): the first line is a header and is always kept.
        Any other value (used for SELL-C-σ): the file has no header.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # flag=1 (sell-C-σ) does not have a header, flag=0 (all others) does have a header
    if flag == 0:
        header, records = lines[:1], lines[1:]
    else:
        header, records = [], lines

    valid_records = [line for line in records if 'synthetic' in line.lower()]
    num_lines = len(records)
    count_synthetic = len(valid_records)

    # Nothing to repair (this also covers an empty file)
    if num_lines == count_synthetic:
        return

    print(f'Number of lines (excluding header): {num_lines}')
    print(f'Occurrences of "synthetic"        : {count_synthetic}')
    print(f'Difference in line count and "synthetic" occurrences for file: {file_path}\n')

    # Backup the original file
    backup_file_path = file_path.replace('.csv', '_BAD.csv')
    os.rename(file_path, backup_file_path)
    print(f'Original file backed up as: {backup_file_path}')

    # Write the filtered content back to the original file path
    with open(file_path, 'w') as f:
        f.writelines(header + valid_records)
    print(f'Filtered file saved as: {file_path}\n')


In [7]:
def read_synthetic(root, file):
    """Repair one benchmark csv in place and load it as a DataFrame.

    The file is first passed through ``fix_synthetic_csv_errors`` and
    ``fix_features`` and then read with pandas.

    SELL-C-σ files ('sell_C_s_d' in the file name) are handled as header-less
    files: they are read with ``header_names`` minus its last two entries as
    column names. All other files are read with the header found in the file.

    Parameters
    ----------
    root : str
        Directory that contains the file.
    file : str
        File name.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    Exception
        Any error raised while repairing or reading the file is reported on
        stdout and then re-raised.
    """
    file_path = os.path.join(root, file)
    try:
        if 'sell_C_s_d' in file:
            fix_synthetic_csv_errors(file_path, 1)  # 1: file has no header
            # We need to do the following because SELL-C-σ runs on different from other matrices benchmark.
            # This way, we make sure that when merging all results according to matrix features,
            # all different features will be considered
            fix_features(root, file)
            SELL_C_S_D_HEADER = header_names[:-2]
            # The file has no header line, so every line is data. header=0
            # would treat the first record as a header and silently drop it.
            df = pd.read_csv(file_path, names=SELL_C_S_D_HEADER, header=None)
        else:
            fix_synthetic_csv_errors(file_path)
            fix_features(root, file)
            df = pd.read_csv(file_path)
    except Exception as e:
        print(f'1) Error reading {file_path}: {e}\n')
        # Re-raise the real error: falling through to `return df` would only
        # replace it with an unrelated "df is not defined" error.
        raise
    return df


In [8]:
def read_device_data_old(directory, System, Arch, header_names, synthetic_flag, threads = 0):
    """Load the single (legacy-layout) result csv of one device.

    The file is looked up as '../benchmark_results/<directory>/<directory><suffix>',
    where the suffix depends on the dataset and the architecture:

    * synthetic_flag == 1, Arch == 'CPU': '_synthetic_t<threads>_<precision>.csv'
    * synthetic_flag == 1, other Arch   : '_dtype-D_run_full_dataset.csv'
    * otherwise, Arch == 'GPU'          : '_dtype-D_run_friend_dataset.csv'
    * otherwise, Arch == 'CPU'          : '_friends_10_samples_30_range_t<threads>_<precision>.csv'

    ``precision`` is the module-level precision suffix.

    Parameters
    ----------
    directory : str
        Device directory name below '../benchmark_results/'.
    System : str
        Device label written to the 'System' column of the result.
    Arch : str
        'CPU' or 'GPU' (any other value is only accepted for the synthetic
        dataset, where it selects the non-CPU file name).
    header_names : list of str
        Column names to assign; the file is read without a header line.
    synthetic_flag : int
        1 for the synthetic dataset, anything else for the 'friends' dataset.
    threads : int
        Thread count that is part of CPU file names.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        For the 'friends' dataset when ``Arch`` is neither 'GPU' nor 'CPU'
        (previously this ended in an unbound-variable error).
    """
    base = '../benchmark_results/' + directory + '/' + directory
    if synthetic_flag == 1:
        if Arch == 'CPU':
            # it is CPU data that we want to read, have to append number of threads too
            file_path = base + '_synthetic_t%d_%s.csv' % (threads, precision)
        else:
            # it is GPU or FPGA data that we want to read
            file_path = base + '_dtype-D_run_full_dataset.csv'
    elif Arch == 'GPU':
        file_path = base + '_dtype-D_run_friend_dataset.csv'
    elif Arch == 'CPU':
        file_path = base + '_friends_10_samples_30_range_t%d_%s.csv' % (threads, precision)
    else:
        raise ValueError(f"no 'friends' result file is defined for Arch={Arch!r} ({System})")

    df = pd.read_csv(file_path, names = header_names)
    df['System'] = System
    print('Finished reading ', System)
    return df

def find_csv_files(directory, system_name, synthetic_flag):
    """Collect all per-format result csv files of one device into one frame.

    ``directory`` is walked recursively. A file is loaded (via
    ``read_synthetic``) when

    * its directory path contains 'rep' and the dataset keyword
      ('synthetic' when synthetic_flag == 1, otherwise 'friends'),
    * its directory path does not contain 'DELETE',
    * its name ends with '.csv' and does not contain 'BAD' (backup files).

    Directories and files are visited in sorted order so that the row order of
    the result does not depend on the file system.

    Parameters
    ----------
    directory : str
        Root directory of the device results.
    system_name : str
        Device label written to the 'System' column of the result.
    synthetic_flag : int
        1 for the synthetic dataset, anything else for the 'friends' dataset.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If no matching csv file is found.
    """
    keyword = 'synthetic' if synthetic_flag == 1 else 'friends'

    df_list = []
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # in-place: makes os.walk descend in a fixed order
        if ('DELETE' in root) or ('rep' not in root) or (keyword not in root):
            continue
        for file in sorted(files):
            if ('BAD' in file) or not file.endswith('.csv'):
                continue
            df_list.append(read_synthetic(root, file))

    if not df_list:
        # pd.concat([]) would fail with the generic "No objects to concatenate"
        raise ValueError(f'No {keyword!r} csv files found under {directory!r} for {system_name}')

    df = pd.concat(df_list, ignore_index=True)
    df['System'] = system_name
    print('Finished reading ', system_name)
    return df


---
# Read GPU data

In [9]:
def read_gpu_data(header_names, synthetic_flag):
    """Load the benchmark results of all GPUs into a single DataFrame.

    P100, V100 and A100 are read from their single legacy csv file; H100 and
    MI250 are collected from their per-format csv files below ``prefix``.
    Two outlier filters are applied:

    * A100: only rows with gflops < 800 are kept (the original author
      attributes the outliers to the Merge kernel);
    * MI250, 'friends' dataset only (synthetic_flag == 0): only rows with
      gflops < 250 are kept.

    The result gets Arch == 'GPU', a summary is printed, and the elapsed time
    is reported.
    """
    t_begin = time.time()

    p100 = read_device_data_old('vulcan-P100', 'NVIDIA-P100', 'GPU', header_names, synthetic_flag)
    v100 = read_device_data_old('vulcan-V100', 'NVIDIA-V100', 'GPU', header_names, synthetic_flag)
    a100 = read_device_data_old('epyc5-A100', 'NVIDIA-A100', 'GPU', header_names, synthetic_flag)
    # discard implausibly high A100 measurements
    a100 = a100[a100['gflops'] < 800]

    h100 = find_csv_files(prefix + 'grace1-H100', 'NVIDIA-H100', synthetic_flag)
    mi250 = find_csv_files(prefix + 'amd-mi250', 'AMD-MI250', synthetic_flag)
    # Only for 'friends' dataset this has to be applied..
    if synthetic_flag == 0:
        mi250 = mi250[mi250['gflops'] < 250]

    inputdata_GPU = pd.concat([p100, v100, a100, h100, mi250])
    inputdata_GPU['Arch'] = 'GPU'

    # no GPU format is discarded at the moment
    inputdata_GPU = remove_formats(inputdata_GPU, [])

    print_formats_per_device(inputdata_GPU)

    # (rounding of the feature columns used to happen here; it is disabled)
    elapsed_time = time.time() - t_begin
    print(f'Time spent in read_gpu_data: {elapsed_time:.2f} seconds\n')
    return inputdata_GPU


---
# Read CPU data

In [10]:
def read_cpu_data(header_names, synthetic_flag):
    """Load the benchmark results of all CPUs into a single DataFrame.

    AMD-EPYC-24, INTEL-XEON-14, INTEL-ICY-16, ARM-NEON-80 and IBM-POWER9-32
    are read from their single legacy csv file (whose name contains the thread
    count); AMD-EPYC-64, INTEL-SAPPHIRE-56 and ARM-GRACE-72 are collected from
    their per-format csv files below ``prefix``.

    Adjustments applied before concatenation:

    * INTEL-SAPPHIRE-56: rows of the AMD library format 'AOCL_OPTMV' are dropped;
    * AMD-EPYC-24: 'avg_bw' is cast to float64;
    * IBM-POWER9-32: 'W_avg' is set to the constant 200.1 because power
      consumption could not be measured on that machine.

    The result gets Arch == 'CPU', the formats 'Custom_CSR_PBV_x86' and
    'Custom_CSR_PBV' are discarded, a summary is printed, and the elapsed time
    is reported.
    """
    start_time = time.time()

    # Thread counts that are part of the legacy result file names
    Epyc_threads     = 24
    Xeon_threads     = 14
    Icy_threads      = 16
    Arm_threads      = 80
    Power9_threads   = 32

    inputdata_CPU_AMD_EPYC1       = read_device_data_old('amd-epyc1', 'AMD-EPYC-24', 'CPU', header_names, synthetic_flag, threads = Epyc_threads)
    inputdata_CPU_AMD_EPYC_64     = find_csv_files(prefix + 'amd-epyc7763', 'AMD-EPYC-64', synthetic_flag)

    inputdata_CPU_INTEL_GOLD2     = read_device_data_old('intel-gold2', 'INTEL-XEON-14', 'CPU', header_names, synthetic_flag, threads = Xeon_threads)
    inputdata_CPU_INTEL_ICY3      = read_device_data_old('intel-icy3', 'INTEL-ICY-16', 'CPU', header_names, synthetic_flag, threads = Icy_threads)
    inputdata_CPU_INTEL_SAPPHIRE  = find_csv_files(prefix + 'intel-sapphire', 'INTEL-SAPPHIRE-56', synthetic_flag)
    # only for this, please discard AMD AOCL format for Intel CPU!
    inputdata_CPU_INTEL_SAPPHIRE  = inputdata_CPU_INTEL_SAPPHIRE[inputdata_CPU_INTEL_SAPPHIRE['format_name']!='AOCL_OPTMV']
    inputdata_CPU_ARM_NEON        = read_device_data_old('arm', 'ARM-NEON-80',    'CPU', header_names, synthetic_flag, threads = Arm_threads)
    inputdata_CPU_ARM_GRACE       = find_csv_files(prefix + 'grace1-arm', 'ARM-GRACE-72', synthetic_flag)

    inputdata_CPU_IBM_POWER9      = read_device_data_old('power9-m100', 'IBM-POWER9-32', 'CPU', header_names, synthetic_flag, threads = Power9_threads)

    # fix some things...
    # astype returns a new frame; without the assignment the cast had no effect
    inputdata_CPU_AMD_EPYC1 = inputdata_CPU_AMD_EPYC1.astype({'avg_bw': 'float64'})
    inputdata_CPU_IBM_POWER9['W_avg'] = 200.1 # We could not measure power consumption from IBM Power9

    inputdata_CPU = pd.concat([inputdata_CPU_AMD_EPYC_64, inputdata_CPU_AMD_EPYC1,
                               inputdata_CPU_INTEL_GOLD2, inputdata_CPU_INTEL_ICY3, inputdata_CPU_INTEL_SAPPHIRE,
                               inputdata_CPU_ARM_NEON, inputdata_CPU_ARM_GRACE,
                               inputdata_CPU_IBM_POWER9])
    inputdata_CPU['Arch'] = 'CPU'

    # formats to discard here!
    formats_to_discard = ['Custom_CSR_PBV_x86', 'Custom_CSR_PBV']
    inputdata_CPU = remove_formats(inputdata_CPU, formats_to_discard)

    print_formats_per_device(inputdata_CPU)

    # (rounding of the feature columns used to happen here; it is disabled)

    elapsed_time = time.time() - start_time
    print(f'Time spent in read_cpu_data: {elapsed_time:.2f} seconds\n')
    return inputdata_CPU


---
# Read FPGA data (skip this for now...)

In [11]:
def read_fpga_data(header_names, synthetic_flag):
    """Load the Alveo-U280 (FPGA) benchmark results.

    Reads one csv file from '../benchmark_results/alveo-u280/': the padded
    synthetic results when synthetic_flag == 1, otherwise the validation
    ("twins") results. The file is read without a header line, using
    ``header_names`` as column names. The result gets System == 'Alveo-U280'
    and Arch == 'FPGA', a summary is printed, and the elapsed time is reported.
    """
    start_time = time.time()

    # Will use this when data from FPGA regarding Vitis Library have been cleaned up
    # inputdata_FPGA_ALVEO_U280 = read_device_data('alveo-u280', 'Alveo-U280', 'FPGA')
    if synthetic_flag == 1:
        fname = 'PADDED-alveo-u280_spmv_4-2048_dtype-D.csv'
    else:
        fname = 'alveo-u280_spmv_validation_matrices_10_samples_30_range_twins_dtype-D.csv'
    # Both datasets are loaded into the same variable. The non-synthetic branch
    # used to assign to a misspelled name, which left this one undefined.
    inputdata_FPGA_ALVEO_U280 = pd.read_csv('../benchmark_results/alveo-u280/%s' % fname, names = header_names)
    inputdata_FPGA_ALVEO_U280['System'] = 'Alveo-U280'

    inputdata_FPGA = pd.concat([inputdata_FPGA_ALVEO_U280])
    inputdata_FPGA['Arch'] = 'FPGA'
    print_formats_per_device(inputdata_FPGA)

    elapsed_time = time.time() - start_time
    print(f'Time spent in read_fpga_data: {elapsed_time:.2f} seconds\n')
    return inputdata_FPGA


---
# Concatenate all data

In [12]:
def concatenate_inputdata(inputdata_GPU, inputdata_CPU, inputdata_FPGA=None):
    """Merge GPU and CPU results, average repeated runs and rename formats.

    Steps:

    1. concatenate ``inputdata_GPU`` and ``inputdata_CPU``;
    2. group by every column of the module-level ``header_names`` except
       'time', 'gflops', 'W_avg' and 'J_estimated', and replace those four
       columns by their mean over the group (repeated measurements of the
       same matrix/format/device collapse into one row);
    3. reorder the columns according to ``header_names``;
    4. replace the raw format names by their human-readable names.

    Parameters
    ----------
    inputdata_GPU, inputdata_CPU : pandas.DataFrame
        Raw measurements.
    inputdata_FPGA : optional
        Accepted for call compatibility but currently ignored: FPGA data is
        not merged.

    Returns
    -------
    pandas.DataFrame
        One averaged row per group, columns ordered as ``header_names``.
    """
    start_time = time.time()

    # Merge the results (FPGA data is intentionally left out for now)
    # inputdata = pd.concat([inputdata_GPU,inputdata_CPU,inputdata_FPGA])
    inputdata = pd.concat([inputdata_GPU, inputdata_CPU])
    print('Concatenated data of all devices:', inputdata.shape)

    # These 4 columns are the ones that we want the new dataframe to be averaged
    # (different measurements collected for the same matrix)
    measured_columns = ['time', 'gflops', 'W_avg', 'J_estimated']
    group_by_header = [x for x in header_names if x not in measured_columns]
    # NOTE(review): groupby drops rows that have NaN in any key column by
    # default — confirm that no key column can be empty in the input files.
    groupreps = inputdata.groupby(group_by_header, observed = True).mean().reset_index().reindex(columns=header_names)
    print('Concatenated data of all devices (average):', groupreps.shape)

    # Rename formats to human-readable form...
    format_name_mapping = {
        'cuSPARSE_csr11': 'cu-CSR',
        'cuSPARSE_coo11': 'cu-COO',
        'cuSPARSE_hyb9-2': 'cu-HYB',
        'CUSPARSE_CSR': 'cu-CSR',
        'CUSPARSE_COO': 'cu-COO',

        'ROCSPARSE_CSR': 'roc-CSR',
        'ROCSPARSE_COO': 'roc-COO',
        'ROCSPARSE_HYB': 'roc-HYB',

        'Merge_11': 'Merge-CSR',
        'MERGE_CUDA': 'Merge-CSR',
        'MERGE': 'Merge-CSR',

        'Custom_CSR_CUDA_ADAPTIVE_b512_1': 'Ada-CSR',
        'Custom_CSR_CUDA_ADAPTIVE_b256_1': 'Ada-CSR',
        'Custom_CSR_ROCM_ADAPTIVE_b512_1': 'Ada-CSR',

        # 'Custom_CSR_CUDA_VECTOR_b512': 'Vec-CSR',
        'Custom_CSR_CUDA_VECTOR_b256': 'Vec-CSR',
        'Custom_CSR_ROCM_VECTOR_b512': 'Vec-CSR',

        'Custom_CSR_CUDA_constant_nnz_per_thread_b1024_nnz4': 'Custom-CSR',
        'Custom_CSR_ROCM_constant_nnz_per_thread_b512_nnz4': 'Custom-CSR',

        'ACC_LINE_ENHANCE_b512': 'ACC-Line',
        'ACC_FLAT_b512': 'ACC-Flat',

        'SELL-32-1': 'SELL-C-σ',
        # 'SELL-32-512': 'SELL-C-σ',

        'Naive_CSR_CPU': 'Naive-CSR',
        'Custom_CSR_B': 'Bal-CSR',
        'Custom_CSR_BV_x86': 'Vec-CSR',
        'Custom_CSR_BV_SVE': 'Vec-CSR',

        'MKL_IE': 'MKL-IE',
        'AOCL_OPTMV': 'AOCL',
        'ARMPL': 'ARM-lib',

        'SparseX': 'SparseX',
        'CSR5_9': 'CSR5',
        'CSR5_CUDA': 'CSR5',
        'CSR5': 'CSR5',
        'DASP_CUDA': 'DASP',
        'LCM': 'LCM',
    }
    groupreps['format_name'] = groupreps['format_name'].replace(format_name_mapping)

    elapsed_time = time.time() - start_time
    print(f'Time spent in concatenate_inputdata: {elapsed_time:.2f} seconds\n')
    return groupreps


---
# Group by "best-of" format_name for each device
# Skip this step if you want to plot every measurement collected

In [13]:
def group_by_best_of(groupreps, header_names):
    """Keep, for every matrix on every device, the row with the highest gflops.

    Rows are grouped by every column of ``header_names`` except 'format_name',
    'time', 'gflops', 'W_avg' and 'J_estimated'. From each group the single
    row with the largest 'gflops' is kept (the first one on ties).

    The winning row is taken directly by position. Re-selecting it by its
    display name could return a different row when two raw formats share the
    same human-readable name. Columns are selected by name, so the result no
    longer depends on the column order of ``groupreps``.

    Parameters
    ----------
    groupreps : pandas.DataFrame
        Averaged measurements; must contain all columns of ``header_names``.
    header_names : list of str
        Columns (and column order) of the result.

    Returns
    -------
    pandas.DataFrame
        One row per group, ordered by the group keys, with a fresh 0..n-1 index.
    """
    start_time = time.time()

    group_by_header = [x for x in header_names if x not in ['format_name', 'time','gflops','W_avg','J_estimated']]

    # unique positional labels, so that idxmax results identify exactly one row
    candidates = groupreps.reset_index(drop=True)
    if candidates.empty:
        group_system_best = pd.DataFrame(columns = header_names)
    else:
        best_rows = candidates.groupby(group_by_header, observed = True)['gflops'].idxmax()
        group_system_best = candidates.loc[best_rows.values, list(header_names)].reset_index(drop=True)

    print('Concatenated data of all devices (best performing format for each matrix):', group_system_best.shape)
    elapsed_time = time.time() - start_time
    print(f'Time spent in group_by_best_of: {elapsed_time:.2f} seconds\n')
    return group_system_best


# Also try the group by "top-K" format_names approach

In [14]:
def group_by_topK_best(groupreps, header_names, k=1):
    """Keep, for every matrix on every device, the ``k`` rows with the highest gflops.

    Rows are grouped by every column of ``header_names`` except 'format_name',
    'time', 'gflops', 'W_avg' and 'J_estimated'. From each group the ``k`` rows
    with the largest 'gflops' are kept (fewer when the group is smaller),
    ordered from best to worst.

    Parameters
    ----------
    groupreps : pandas.DataFrame
        Averaged measurements; must contain the grouping columns and 'gflops'.
    header_names : list of str
        Column names used to derive the grouping columns.
    k : int
        Number of rows to keep per group.

    Returns
    -------
    pandas.DataFrame
        Groups in key order, with a fresh 0..n-1 index. An empty input yields
        an empty frame with the columns of ``groupreps``.
    """
    start_time = time.time()

    group_by_header = [x for x in header_names if x not in ['format_name', 'time','gflops','W_avg','J_estimated']]
    group_system = groupreps.groupby(group_by_header, as_index = False, observed = True)

    reslist = [experiment.nlargest(k, 'gflops') for _, experiment in group_system]

    if reslist:
        group_system_best = pd.concat(reslist).reset_index(drop=True)
    else:
        # pd.concat refuses an empty list; return an empty frame instead
        group_system_best = groupreps.iloc[0:0].reset_index(drop=True)

    print('Concatenated data of all devices (Top', k, 'performing format for each matrix):', group_system_best.shape)
    elapsed_time = time.time() - start_time
    # report under this function's own name (the message used to say group_by_best_of)
    print(f'Time spent in group_by_topK_best: {elapsed_time:.2f} seconds\n')
    return group_system_best


---
# Add some extra columns to dataframes "groupreps" and "group_system_best"

In [15]:
def add_extra_columns(df, ranges_crs, ranges_ann, ranges_size, ranges_anr, ranges_skew):
    """Return a copy of ``df`` extended with derived and categorical columns.

    Only rows with W_avg >= 0 are kept. The added columns are:

    * 'impl_arch'         : '( <System> )\\t<format_name>'
    * 'energy_efficiency' : gflops / W_avg
    * 'GFLOPs^2-per-W'    : gflops * gflops / W_avg
    * 'crs_categ'         : size class of 'cross_row_similarity'
    * 'ann_categ'         : size class of 'avg_num_neighbours'
    * 'regularity'        : crs_categ + ann_categ
    * 'anr_categ'         : range label of 'avg_nnz_per_row'
    * 'skew_categ'        : range label of 'skew'

    Classification is done by ``set_category``; values outside every range
    get None. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements with the columns referenced above.
    ranges_crs, ranges_ann : list of str
        '[min-max]' ranges, each parallel to ``ranges_size``.
    ranges_size : list of str
        Size-class labels returned for ``ranges_crs`` / ``ranges_ann``.
    ranges_anr, ranges_skew : list of str
        '[min-max]' ranges whose own label is returned.

    Returns
    -------
    pandas.DataFrame
    """
    # explicit copy: the new columns are written to a frame we own
    df = df[df['W_avg'] >= 0].copy()

    # (a disabled filter used to drop AMD-EPYC-64 rows with W_avg <= 30 here)

    df['impl_arch'] = '( ' + df['System'] + ' )\t' + df['format_name']
    df['energy_efficiency'] = df['gflops'] / df['W_avg']
    df['GFLOPs^2-per-W'] = df['gflops'] * df['gflops'] / df['W_avg']

    def categorize(column, ranges, sizes, use_sizes):
        # Classify one column value by value. This yields the same labels as a
        # row-wise DataFrame.apply but only touches the column that is needed.
        return df[column].apply(
            lambda value: set_category(value, ranges, sizes, ranges_size_flag=use_sizes))

    df['crs_categ'] = categorize('cross_row_similarity', ranges_crs, ranges_size, True)
    df['ann_categ'] = categorize('avg_num_neighbours', ranges_ann, ranges_size, True)
    df['regularity'] = df['crs_categ'] + df['ann_categ']

    df['anr_categ'] = categorize('avg_nnz_per_row', ranges_anr, [], False)
    df['skew_categ'] = categorize('skew', ranges_skew, [], False)

    return df

def insert_new_info(groupreps, group_system_best):
    """Extend both benchmark frames with the derived columns and blank out infinities.

    Each frame is passed through ``add_extra_columns`` using the notebook-level
    range tables (``ranges_crs``, ``ranges_ann``, ``ranges_size``,
    ``ranges_anr``, ``ranges_skew``); afterwards every ``+inf`` / ``-inf`` value
    is replaced by ``NaN``. The elapsed wall-clock time is printed.

    Returns the pair ``(groupreps, group_system_best)`` after processing.
    """
    t_begin = time.time()

    # The range tables are globals defined outside this function.
    range_tables = (ranges_crs, ranges_ann, ranges_size, ranges_anr, ranges_skew)
    all_runs = add_extra_columns(groupreps, *range_tables)
    best_runs = add_extra_columns(group_system_best, *range_tables)

    # this will be used sometime in the future...
    extra_header_names = [
        'impl_arch', 'energy_efficiency', 'GFLOPs^2-per-W',
        'crs_categ', 'ann_categ', 'regularity', 'anr_categ', 'skew_categ',
    ]

    # The ratio columns divide by W_avg, which may be zero.
    infinite_values = [np.inf, -np.inf]
    all_runs = all_runs.replace(infinite_values, np.nan)
    best_runs = best_runs.replace(infinite_values, np.nan)

    seconds_taken = time.time() - t_begin
    print(f'Time spent in insert_new_info: {seconds_taken:.2f} seconds\n')
    return all_runs, best_runs


# Read benchmarks regarding synthetic matrices (15K matrix dataset)

In [16]:
%%time
# Synthetic-dataset pipeline: read the per-device results, merge them, derive
# the 'best-of' frame, add the derived columns and export both frames to CSV.
# synthetic_flag is forwarded to the read_* helpers; 1 is the value used for
# the synthetic dataset (the validation-friends cell below uses 0).
synthetic_flag = 1
# FPGA results are currently left out (reader call disabled).
# inputdata_FPGA = read_fpga_data(header_names, synthetic_flag)
inputdata_GPU = read_gpu_data(header_names, synthetic_flag)
inputdata_CPU = read_cpu_data(header_names, synthetic_flag)

# Concatenate all data
groupreps = concatenate_inputdata(inputdata_GPU, inputdata_CPU)

# Group by 'best-of' format_name for each device
group_system_best = group_by_best_of(groupreps, header_names)

# Add some extra columns to dataframes 'groupreps' and 'group_system_best'
# (both names are rebound to the frames returned by insert_new_info)
groupreps, group_system_best = insert_new_info(groupreps, group_system_best)

# Store in 2 csv files. 1 for 'all' benchmarks, 1 for 'best-of' benchmarks
# Paths are relative to the current working directory; index=False leaves the
# row index out of the files.
groupreps.to_csv('synthetic_benchmarks_all-devices_all.csv', sep=',', header=True, index=False)
group_system_best.to_csv('synthetic_benchmarks_all-devices_best-of.csv', sep=',', header=True, index=False)

Finished reading  NVIDIA-P100
Finished reading  NVIDIA-V100
Finished reading  NVIDIA-A100


  df = pd.concat(df_list, ignore_index=True)


Finished reading  NVIDIA-H100
Finished reading  AMD-MI250
------------------------
Size of dataframe
(1283609, 30)
Tested formats
{'cuSPARSE_csr11', 'cuSPARSE_coo11', 'CSR5_CUDA', 'Custom_CSR_CUDA_ADAPTIVE_b512_1', 'ACC_FLAT_b512', 'Custom_CSR_CUDA_VECTOR_b256', 'CUSPARSE_COO', 'CUSPARSE_CSR', 'ROCSPARSE_COO', 'ROCSPARSE_CSR', 'Custom_CSR_CUDA_constant_nnz_per_thread_b1024_nnz4', 'Custom_CSR_CUDA_ADAPTIVE_b256_1', 'Custom_CSR_ROCM_VECTOR_b512', 'cuSPARSE_hyb9-2', 'Custom_CSR_ROCM_constant_nnz_per_thread_b512_nnz4', 'DASP_CUDA', 'MERGE_CUDA', 'SELL-32-1', 'ACC_LINE_ENHANCE_b512', 'ROCSPARSE_HYB', 'CSR5_9', 'Merge_11'}
Tested formats per device:
AMD-MI250 	 {'ACC_LINE_ENHANCE_b512', 'ROCSPARSE_COO', 'ROCSPARSE_CSR', 'ROCSPARSE_HYB', 'Custom_CSR_CUDA_ADAPTIVE_b512_1', 'Custom_CSR_ROCM_VECTOR_b512', 'ACC_FLAT_b512', 'Custom_CSR_ROCM_constant_nnz_per_thread_b512_nnz4'}
NVIDIA-P100 	 {'CSR5_9', 'cuSPARSE_hyb9-2', 'cuSPARSE_csr11', 'cuSPARSE_coo11'}
NVIDIA-A100 	 {'SELL-32-1', 'cuSPARSE_csr11

# Read benchmarks regarding validation-friends matrices (3K matrix dataset)

In [17]:
%%time
# Validation-friends pipeline: same steps as the synthetic cell, except that
# the derived columns are not added before exporting.
# NOTE: this rebinds synthetic_flag, groupreps and group_system_best, so the
# synthetic-dataset frames from the previous cell are no longer reachable
# under these names once this cell has run.
synthetic_flag = 0
# FPGA results are currently left out (reader call disabled).
# inputdata_FPGA = read_fpga_data(header_names, synthetic_flag)
inputdata_GPU = read_gpu_data(header_names, synthetic_flag)
inputdata_CPU = read_cpu_data(header_names, synthetic_flag)

# Concatenate all data
groupreps = concatenate_inputdata(inputdata_GPU, inputdata_CPU)

# Group by 'best-of' format_name for each device
group_system_best = group_by_best_of(groupreps, header_names)
# Optional top-K variants (disabled):
# group_system_top2 = group_by_topK_best(groupreps, header_names, k=2)
# group_system_top3 = group_by_topK_best(groupreps, header_names, k=3)
# group_system_top4 = group_by_topK_best(groupreps, header_names, k=4)
# group_system_top5 = group_by_topK_best(groupreps, header_names, k=5)

# NO NEED TO DO THIS FOR VALIDATION-FRIENDS
# (as a result the CSVs written below do not contain the columns that
# insert_new_info adds, e.g. 'impl_arch' or 'energy_efficiency')
# Add some extra columns to dataframes 'groupreps' and 'group_system_best'
# groupreps, group_system_best = insert_new_info(groupreps, group_system_best)

# Store in 2 csv files. 1 for 'all' benchmarks, 1 for 'best-of' benchmarks
# Paths are relative to the current working directory; index=False leaves the
# row index out of the files.
groupreps.to_csv('validation_friends_benchmarks_all-devices_all.csv', sep=',', header=True, index=False)
group_system_best.to_csv('validation_friends_benchmarks_all-devices_best-of.csv', sep=',', header=True, index=False)
# group_system_top2.to_csv('validation_friends_benchmarks_all-devices_top2.csv', sep=',', header=True, index=False)
# group_system_top3.to_csv('validation_friends_benchmarks_all-devices_top3.csv', sep=',', header=True, index=False)
# group_system_top4.to_csv('validation_friends_benchmarks_all-devices_top4.csv', sep=',', header=True, index=False)
# group_system_top5.to_csv('validation_friends_benchmarks_all-devices_top5.csv', sep=',', header=True, index=False)


Finished reading  NVIDIA-P100
Finished reading  NVIDIA-V100
Finished reading  NVIDIA-A100
Finished reading  NVIDIA-H100
Finished reading  AMD-MI250
------------------------
Size of dataframe
(337309, 30)
Tested formats
{'cuSPARSE_csr11', 'cuSPARSE_coo11', 'CSR5_CUDA', 'Custom_CSR_CUDA_ADAPTIVE_b512_1', 'ACC_FLAT_b512', 'Custom_CSR_CUDA_VECTOR_b256', 'CUSPARSE_COO', 'CUSPARSE_CSR', 'ROCSPARSE_COO', 'ROCSPARSE_CSR', 'Custom_CSR_CUDA_ADAPTIVE_b256_1', 'Custom_CSR_CUDA_constant_nnz_per_thread_b1024_nnz4', 'Custom_CSR_ROCM_VECTOR_b512', 'cuSPARSE_hyb9-2', 'Custom_CSR_ROCM_constant_nnz_per_thread_b512_nnz4', 'DASP_CUDA', 'MERGE_CUDA', 'SELL-32-1', 'ACC_LINE_ENHANCE_b512', 'ROCSPARSE_HYB', 'CSR5_9', 'Merge_11'}
Tested formats per device:
AMD-MI250 	 {'ACC_LINE_ENHANCE_b512', 'ROCSPARSE_COO', 'ROCSPARSE_CSR', 'ROCSPARSE_HYB', 'Custom_CSR_CUDA_ADAPTIVE_b512_1', 'Custom_CSR_ROCM_VECTOR_b512', 'ACC_FLAT_b512', 'Custom_CSR_ROCM_constant_nnz_per_thread_b512_nnz4'}
NVIDIA-P100 	 {'CSR5_9', 'cuSPARSE

In [18]:
# %%time
# synthetic_flag = 0
# # inputdata_FPGA = read_fpga_data(header_names, synthetic_flag)
# inputdata_GPU = read_gpu_data(header_names, synthetic_flag)
# inputdata_GPU = inputdata_GPU[inputdata_GPU['gflops']<1000]
# inputdata_CPU = read_cpu_data(header_names, synthetic_flag)

# # Concatenate all data
# groupreps = concatenate_inputdata(inputdata_GPU, inputdata_CPU)

# # Group by 'best-of' format_name for each device
# group_system_best = group_by_best_of(groupreps, header_names)

# # NO NEED TO DO THIS FOR VALIDATION-FRIENDS
# # Add some extra columns to dataframes 'groupreps' and 'group_system_best'
# groupreps, group_system_best = insert_new_info(groupreps, group_system_best)

# # Store in 2 csv files. 1 for 'all' benchmarks, 1 for 'best-of' benchmarks
# groupreps.to_csv('validation_friends_benchmarks_all-devices_all2.csv', sep=',', header=True, index=False)
# group_system_best.to_csv('validation_friends_benchmarks_all-devices_best-of2.csv', sep=',', header=True, index=False)
