In [1]:
import os
import pandas as pd

In [2]:
# Maps each benchmark implementation to the format label used inside its result file.
# When a new format is added, register it here as one key:value pair:
#   key:   result filename without the "_<precision>.csv" suffix (e.g. "csr" for csr_d.csv)
#   value: the format name exactly as it is written in the rows of that file
impl_dict = {
    'csr_naive': 'Naive_CSR_CPU',
    'csr': 'Custom_CSR_B',
    'csr_vector_x86': 'Custom_CSR_BV_x86',
    'csr_vector_sve': 'Custom_CSR_BV_SVE',

    'mkl_ie': 'MKL_IE',
    'aocl_optmv': 'AOCL_OPTMV',
    'armpl': 'ARMPL',

    'csr5': 'CSR5',
    'merge': 'MERGE',
    'sell_C_s': 'SELL-32-1',
    'sparsex': 'SparseX',

    'csr_rocm_vector_b512_nv': 'Custom_CSR_ROCM_VECTOR_b512',
    # NOTE(review): the key is a ROCm implementation but the label says CUDA, unlike its
    # ROCm neighbours. Confirm against the actual result file before changing it.
    'csr_rocm_adaptive_b512_mb1_nv': 'Custom_CSR_CUDA_ADAPTIVE_b512_1',
    'csr_rocm_const_nnz_per_thread_b512_nnz4_nv': 'Custom_CSR_ROCM_constant_nnz_per_thread_b512_nnz4',

    'csr_rocm_acc_flat_b512_nv': 'ACC_FLAT_b512',
    'csr_rocm_acc_line_enhance_b512_nv': 'ACC_LINE_ENHANCE_b512',

    'rocsparse_csr_nv': 'ROCSPARSE_CSR',
    'rocsparse_coo_nv': 'ROCSPARSE_COO',
    'rocsparse_hyb_nv': 'ROCSPARSE_HYB',

    'csr_cuda_vector_b256_nv': 'Custom_CSR_CUDA_VECTOR_b256',
    'csr_cuda_adaptive_b256_mb1_nv': 'Custom_CSR_CUDA_ADAPTIVE_b256_1',
    'csr_cuda_const_nnz_per_thread_b1024_nnz4_nv': 'Custom_CSR_CUDA_constant_nnz_per_thread_b1024_nnz4',

    'csr5_cuda_nv': 'CSR5_CUDA',
    'merge_cuda_nv': 'MERGE_CUDA',
    'dasp_cuda_nv': 'DASP_CUDA',

    'cusparse_csr_nv': 'CUSPARSE_CSR',
    'cusparse_coo_nv': 'CUSPARSE_COO',
}

# Precision tag used in result and output file names ("<impl>_<precision>.csv").
# Presumably "d" stands for double precision; confirm with the benchmark scripts.
precision = "d"
# Root directory; each testbed's results live in "<prefix><testbed>/".
prefix = "/home/pmpakos/A/"

In [24]:
#################################
def fix_synthetic_csv_errors(file_path, flag=0):
    """Remove malformed rows from a synthetic-benchmark result CSV, in place.

    Every valid data row is expected to contain the word "synthetic"
    (case-insensitive). When at least one data row lacks it, the original file
    is renamed to ``<name>_BAD<ext>`` and a filtered copy is written back to
    ``file_path``. A file whose data rows all contain the keyword (including an
    empty file) is left untouched.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to check.
    flag : int, default 0
        0 means the first line is a header, which is never counted and is
        always kept. Any other value means the file has no header (used for
        the SELL-C-s result files).

    Returns
    -------
    None
    """
    keyword = "synthetic"

    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Separate the header so it is neither counted nor filtered. Slicing keeps
    # this safe for an empty file.
    if flag == 0:
        header, data_lines = lines[:1], lines[1:]
    else:
        header, data_lines = [], lines

    # Decide row by row. Summing substring occurrences over the whole file
    # would let a row that mentions the keyword twice hide a row that lacks it.
    good_lines = [line for line in data_lines if keyword in line.lower()]
    num_lines = len(data_lines)
    count_synthetic = len(good_lines)

    if num_lines == count_synthetic:
        return

    print(f"Number of lines (excluding header): {num_lines}")
    print(f"Occurrences of 'synthetic'        : {count_synthetic}")
    print(f"Difference in line count and 'synthetic' occurrences for file: {file_path}\n")

    # Backup the original file. splitext touches only the real extension,
    # whereas str.replace would rewrite every ".csv" found in the path.
    stem, ext = os.path.splitext(file_path)
    backup_file_path = stem + '_BAD' + ext
    os.rename(file_path, backup_file_path)
    print(f"Original file backed up as: {backup_file_path}")

    # Write the filtered content back to the original file path
    with open(file_path, 'w') as f:
        f.writelines(header + good_lines)
    print(f"Filtered file saved as: {file_path}\n")

def read_synthetic(root, file):
    """Clean one synthetic/friends result CSV and load it as a DataFrame.

    The file is first passed through ``fix_synthetic_csv_errors`` and
    ``fix_features`` (both rewrite it in place) and is then parsed by pandas.

    Parameters
    ----------
    root : str
        Directory that contains the file.
    file : str
        File name; names containing "sell_C_s_d" are treated as headerless.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or None when cleaning or parsing failed. The error is
        printed. ``pd.concat`` silently drops None entries, so callers that
        collect the results in a list keep working.
    """
    file_path = os.path.join(root, file)
    # print(f"File: {file_path}")
    try:
        if "sell_C_s_d" in file:
            fix_synthetic_csv_errors(file_path, 1)  # 1: file has no header line
            # SELL-C-s is produced by a different benchmark run than the other
            # formats. Re-aligning its feature columns makes sure all results
            # can later be merged on the matrix features.
            fix_features(root, file)
            SELL_C_S_D_HEADER = [
                "matrix_name", "distribution", "placement", "seed", "nr_rows", "nr_cols",
                "nr_nzeros", "density", "mem_footprint", "mem_range", "avg_nnz_per_row",
                "std_nnz_per_row", "avg_bw", "std_bw", "avg_bw_scaled", "std_bw_scaled",
                "avg_sc", "std_sc", "avg_sc_scaled", "std_sc_scaled", "skew",
                "avg_num_neighbours", "cross_row_similarity", "format_name", "time",
                "gflops", "W_avg", "J_estimated"]
            # header=None: the file has no header row. With header=0 pandas
            # would treat the first data row as a header and discard it.
            return pd.read_csv(file_path, names=SELL_C_S_D_HEADER, header=None)

        fix_synthetic_csv_errors(file_path)
        fix_features(root, file)
        return pd.read_csv(file_path)
    except Exception as e:
        print(f"1) Error reading {file_path}: {e}\n")
        # Previously execution fell through to `return df` with df unbound,
        # turning every reported error into an UnboundLocalError.
        return None


In [5]:
#################################
def fix_validation_csv_errors(file_path, flag=0):
    """Remove malformed rows from a validation-benchmark result CSV, in place.

    Every valid data row is expected to contain the substring "mtx"
    (case-insensitive). When at least one data row lacks it, the original file
    is renamed to ``<name>_BAD<ext>`` and a filtered copy is written back to
    ``file_path``. A file whose data rows all contain the substring (including
    an empty file) is left untouched.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to check.
    flag : int, default 0
        0 means the first line is a header, which is never counted and is
        always kept. Any other value means the file has no header (used for
        the SELL-C-s result files).

    Returns
    -------
    None
    """
    keyword = "mtx"

    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Separate the header so it is neither counted nor filtered. Slicing keeps
    # this safe for an empty file.
    if flag == 0:
        header, data_lines = lines[:1], lines[1:]
    else:
        header, data_lines = [], lines

    # Decide row by row. Summing substring occurrences over the whole file
    # would let a row that contains "mtx" twice (e.g. in a directory name and
    # in the file extension) hide a row that lacks it.
    good_lines = [line for line in data_lines if keyword in line.lower()]
    num_lines = len(data_lines)
    count_mtx = len(good_lines)

    if num_lines == count_mtx:
        return

    print(f"Number of lines (excluding header): {num_lines}")
    print(f"Occurrences of 'mtx'              : {count_mtx}")
    print(f"Difference in line count and 'mtx' occurrences for file: {file_path}\n")

    # Backup the original file. splitext touches only the real extension,
    # whereas str.replace would rewrite every ".csv" found in the path.
    stem, ext = os.path.splitext(file_path)
    backup_file_path = stem + '_BAD' + ext
    os.rename(file_path, backup_file_path)
    print(f"Original file backed up as: {backup_file_path}")

    # Write the filtered content back to the original file path
    with open(file_path, 'w') as f:
        f.writelines(header + good_lines)
    print(f"Filtered file saved as: {file_path}\n")


def read_validation(root, file):
    """Clean one validation result CSV and load it as a DataFrame.

    The file is first passed through ``fix_validation_csv_errors`` (which may
    rewrite it in place) and is then parsed by pandas.

    Parameters
    ----------
    root : str
        Directory that contains the file.
    file : str
        File name; names containing "sell_C_s_d" are treated as headerless.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or None when cleaning or parsing failed. The error is
        printed.
    """
    file_path = os.path.join(root, file)
    # print(f"File: {file_path}")
    try:
        if "sell_C_s_d" in file:
            SELL_C_S_D_HEADER = [
                "matrix_name", "num_threads", "csr_m", "csr_n", "csr_nnz", "time",
                "gflops", "csr_mem_footprint", "W_avg", "J_estimated", "format_name",
                "m", "n", "nnz", "mem_footprint"]
            fix_validation_csv_errors(file_path, 1)  # 1: file has no header line
            # header=None: the file has no header row. With header=0 pandas
            # would treat the first data row as a header and discard it.
            return pd.read_csv(file_path, names=SELL_C_S_D_HEADER, header=None)

        fix_validation_csv_errors(file_path)
        return pd.read_csv(file_path)
    except Exception as e:
        print(f"2) Error reading {file_path}: {e}\n")    
        # Previously execution fell through to `return df` with df unbound,
        # turning every reported error into an UnboundLocalError.
        return None


In [6]:
#################################
def find_csv_files(directory):
    """Collect all result CSVs below ``directory`` into two DataFrames.

    A file is considered only when its directory path contains "rep" and not
    "DELETE", and its name ends in ".csv" and does not contain "BAD" (backup
    copies written by the cleaning helpers). Files in a path containing
    "validation" are passed through ``read_validation``; all others go through
    ``read_synthetic`` and are grouped by whether the path contains "friends"
    and/or "synthetic".

    Parameters
    ----------
    directory : str
        Root of the tree to scan.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(friends_df, synthetic_df)``. A group for which nothing could be read
        is returned as an empty DataFrame.
    """
    friends_df_list = []
    synthetic_df_list = []
    for root, dirs, files in os.walk(directory):
        # Sorted traversal makes the row order of the result reproducible;
        # os.walk otherwise yields entries in filesystem order.
        dirs.sort()
        if ("DELETE" in root) or ("rep" not in root):
            continue
        for file in sorted(files):
            if ("BAD" in file) or not file.endswith('.csv'):
                continue
            if "validation" in root:
                # Validation tables are not part of the returned result. The
                # call is kept because it cleans the file on disk.
                read_validation(root, file)
                continue
            curr_df = read_synthetic(root, file)
            if curr_df is None:
                # The reader already reported the failure; skip this file.
                continue
            if "friends" in root:
                # print("friends_df_list:", os.path.join(root, file))
                friends_df_list.append(curr_df)
            if "synthetic" in root:
                # print("synthetic_df_list:", os.path.join(root, file))
                synthetic_df_list.append(curr_df)

    # pd.concat raises ValueError on an empty list, so fall back to an empty frame.
    friends_df = pd.concat(friends_df_list, ignore_index=True) if friends_df_list else pd.DataFrame()
    synthetic_df = pd.concat(synthetic_df_list, ignore_index=True) if synthetic_df_list else pd.DataFrame()
    print("friends:   ", friends_df.shape)
    print("synthetic: ", synthetic_df.shape)
    return (friends_df, synthetic_df)


---

In [7]:
%%time 
# testbed = "amd-epyc7763"
# threads = 64

# friends_df, synthetic_df = find_csv_files(prefix + testbed + "/")

# output_csv_file = prefix + testbed + "/" + testbed + "_" + "friends_10_samples_30_range" + "_t" + str(threads) + "_" + precision + ".csv"
# friends_df.to_csv(output_csv_file, index=False, header=False, sep=',')
# output_csv_file = prefix + testbed + "/" + testbed + "_" + "synthetic" + "_t" + str(threads) + "_" + precision + ".csv"
# synthetic_df.to_csv(output_csv_file, index=False, header=False, sep=',')


CPU times: user 9 µs, sys: 2 µs, total: 11 µs
Wall time: 20.7 µs


In [8]:
%%time 
# testbed = "amd-mi250"

# friends_df, synthetic_df = find_csv_files(prefix + testbed + "/")

# output_csv_file = prefix + testbed + "/" + testbed + "_" + "friends_10_samples_30_range" + "_" + precision + ".csv"
# friends_df.to_csv(output_csv_file, index=False, sep=',')
# output_csv_file = prefix + testbed + "/" + testbed + "_" + "synthetic" + "_" + precision + ".csv"
# synthetic_df.to_csv(output_csv_file, index=False, sep=',')


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.3 µs


In [9]:
%%time 
# testbed = "intel-sapphire"
# threads = 56

# friends_df, synthetic_df = find_csv_files(prefix + testbed + "/")

# output_csv_file = prefix + testbed + "/" + testbed + "_" + "friends_10_samples_30_range" + "_t" + str(threads) + "_" + precision + ".csv"
# friends_df.to_csv(output_csv_file, index=False, header=False, sep=',')
# output_csv_file = prefix + testbed + "/" + testbed + "_" + "synthetic" + "_t" + str(threads) + "_" + precision + ".csv"
# synthetic_df.to_csv(output_csv_file, index=False, header=False, sep=',')


CPU times: user 7 µs, sys: 1 µs, total: 8 µs
Wall time: 16.5 µs


In [32]:
%%time 
def fix_features(root, file):
    """Rewrite a result CSV so every row starts with the reference feature columns.

    Reference rows are read from ``feats_synthetic.csv`` or
    ``feats_friends.csv`` (resolved against the current working directory),
    chosen by which keyword appears in the file path. In each data row,
    everything before ``,<format name>`` is replaced by the matching reference
    row; the format name and whatever follows it are kept. The format name is
    looked up in the module-level ``impl_dict`` using the file name with the
    ``_<precision>.csv`` suffix removed.

    Reference rows and data rows are paired by position when both lists have
    the same length. Otherwise the data rows are treated as an in-order subset
    of the reference rows and are matched on their first five comma-separated
    fields.

    The original file is renamed to ``<name>_BAD_FEATURES<ext>`` and the
    rewritten rows are saved under the original name. Nothing on disk is
    touched when an error is raised.

    NOTE(review): running this twice on the same file replaces the backup with
    the already rewritten file, so the pristine original is lost.

    Parameters
    ----------
    root : str
        Directory that contains the file.
    file : str
        File name of the result CSV.

    Raises
    ------
    KeyError
        If the implementation derived from ``file`` is not in ``impl_dict``.
    ValueError
        If the path names neither "friends" nor "synthetic", if a row does not
        contain the expected format name, or if a row has no matching
        reference row.
    """
    file_path = os.path.join(root, file)

    # 'synthetic' is tested first because it also took precedence before, when
    # both keywords occurred in the path.
    if 'synthetic' in file_path:
        feats_path = 'feats_synthetic.csv'
    elif 'friends' in file_path:
        feats_path = 'feats_friends.csv'
    else:
        raise ValueError(
            f"Cannot choose a feature file for {file_path}: "
            "path contains neither 'friends' nor 'synthetic'")
    with open(feats_path) as f:
        features = [row.strip('\n') for row in f.readlines()]

    with open(file_path, 'r') as f:
        lines = f.readlines()
    # SELL-C-s files have no header line; every other file keeps its first
    # line verbatim. Slicing keeps this safe for an empty file.
    header = []
    if 'sell_C_s' not in file_path:
        header, lines = lines[:1], lines[1:]

    impl = file.replace('_' + precision + '.csv', '')
    marker = ',' + impl_dict[impl]
    # print(impl, impl_dict[impl])

    def rebuild(feature_row, line):
        # Keep everything after the FIRST ",<format name>". The former
        # split(...)[1] dropped data when the marker occurred more than once
        # and raised a bare IndexError when it was missing.
        _, found, tail = line.partition(marker)
        if not found:
            raise ValueError(
                f"Format name {marker[1:]!r} not found in a row of {file_path}: {line!r}")
        return feature_row + marker + tail

    if len(features) == len(lines):
        rebuilt = [rebuild(feat, line) for feat, line in zip(features, lines)]
    else:
        # print(file_path, "\t", "len(features)", len(features), "\t", "len(lines)", len(lines))
        feature_keys = [','.join(feat.split(',')[0:5]) for feat in features]
        rebuilt = []
        cnt = 0  # next reference row to try; only ever moves forward
        for line in lines:
            key = ','.join(line.split(',')[0:5])
            while cnt < len(features) and feature_keys[cnt] != key:
                cnt += 1
            if cnt == len(features):
                # Previously this ran past the end of `features` with a bare IndexError.
                raise ValueError(
                    f"No matching feature row for a row of {file_path}: {line!r}")
            rebuilt.append(rebuild(features[cnt], line))
            cnt += 1

    # Backup the original file. splitext touches only the real extension,
    # whereas str.replace would rewrite every ".csv" found in the path.
    stem, ext = os.path.splitext(file_path)
    backup_file_path = stem + '_BAD_FEATURES' + ext
    os.rename(file_path, backup_file_path)
    # Write the rewritten content back to the original file path
    with open(file_path, 'w') as f:
        f.writelines(header + rebuilt)
    # print(f"Original file backed up as: {backup_file_path}")
    # print(f"Filtered file saved as: {file_path}\n")
testbed = "grace1-arm"
threads = 72

friends_df, synthetic_df = find_csv_files(f"{prefix}{testbed}/")

# Both tables go next to the testbed's results. This run puts the thread count
# in the file name and writes the rows without a header line.
for frame, tag in ((friends_df, "friends_10_samples_30_range"), (synthetic_df, "synthetic")):
    output_csv_file = f"{prefix}{testbed}/{testbed}_{tag}_t{threads}_{precision}.csv"
    frame.to_csv(output_csv_file, index=False, header=False, sep=',')


Number of lines (excluding header): 16190
Occurrences of 'synthetic'        : 15760
Difference in line count and 'synthetic' occurrences for file: /home/pmpakos/A/grace1-arm/intermediate/rep1/synthetic/csr_vector_sve_d.csv

Original file backed up as: /home/pmpakos/A/grace1-arm/intermediate/rep1/synthetic/csr_vector_sve_d_BAD.csv
Filtered file saved as: /home/pmpakos/A/grace1-arm/intermediate/rep1/synthetic/csr_vector_sve_d.csv

Number of lines (excluding header): 16190
Occurrences of 'synthetic'        : 16186
Difference in line count and 'synthetic' occurrences for file: /home/pmpakos/A/grace1-arm/intermediate/rep1/synthetic/sparsex_d.csv

Original file backed up as: /home/pmpakos/A/grace1-arm/intermediate/rep1/synthetic/sparsex_d_BAD.csv
Filtered file saved as: /home/pmpakos/A/grace1-arm/intermediate/rep1/synthetic/sparsex_d.csv

Number of lines (excluding header): 3637
Occurrences of 'synthetic'        : 3570
Difference in line count and 'synthetic' occurrences for file: /home/pmpa

In [34]:
%%time 
testbed = "grace1-H100"

friends_df, synthetic_df = find_csv_files(f"{prefix}{testbed}/")

# Both tables go next to the testbed's results. This run has no thread count
# in the file name and keeps the default header line.
for frame, tag in ((friends_df, "friends_10_samples_30_range"), (synthetic_df, "synthetic")):
    output_csv_file = f"{prefix}{testbed}/{testbed}_{tag}_{precision}.csv"
    frame.to_csv(output_csv_file, index=False, sep=',')


Number of lines (excluding header): 16190
Occurrences of 'synthetic'        : 15650
Difference in line count and 'synthetic' occurrences for file: /home/pmpakos/A/grace1-H100/intermediate/rep1/synthetic/merge_cuda_nv_d.csv

Original file backed up as: /home/pmpakos/A/grace1-H100/intermediate/rep1/synthetic/merge_cuda_nv_d_BAD.csv
Filtered file saved as: /home/pmpakos/A/grace1-H100/intermediate/rep1/synthetic/merge_cuda_nv_d.csv

Number of lines (excluding header): 16865
Occurrences of 'synthetic'        : 13445
Difference in line count and 'synthetic' occurrences for file: /home/pmpakos/A/grace1-H100/intermediate/rep1/synthetic/csr_cuda_adaptive_b256_mb1_nv_d.csv

Original file backed up as: /home/pmpakos/A/grace1-H100/intermediate/rep1/synthetic/csr_cuda_adaptive_b256_mb1_nv_d_BAD.csv
Filtered file saved as: /home/pmpakos/A/grace1-H100/intermediate/rep1/synthetic/csr_cuda_adaptive_b256_mb1_nv_d.csv

Number of lines (excluding header): 3637
Occurrences of 'synthetic'        : 3633
Diff