In [1]:
%config Completer.use_jedi = False
import pandas as pd
import numpy as np
from scipy.stats import hmean
import matplotlib.pyplot as plt
import os

In [2]:
def csv_parser(log_file, results_file):
    """Extract comma-separated result rows from a raw benchmark log.

    The log is scanned line by line:

    * a line whose first whitespace-separated token contains ``"pmpakos@"``
      (presumably a shell prompt -- confirm against the logs) resets the
      current matrix name;
    * a line starting with ``INFO: loading Mtx`` sets the current matrix name
      to the basename of its last token, without the ``.mtx`` extension;
    * any line with more than two comma-separated fields whose second field
      is not ``" matrix"`` (the header row) is kept: every ``"app"`` in it is
      replaced by the current matrix name and a leading ``"DATA_CSV:,"`` tag
      is removed.

    Kept rows are echoed to stdout and written to ``results_file``.

    Parameters
    ----------
    log_file : str
        Path of the raw log to read.
    results_file : str
        Path of the output file; overwritten if it exists.
    """
    prefix = "DATA_CSV:,"
    results_list = []
    # Defined up front so a data row that appears before any "loading Mtx"
    # line cannot raise UnboundLocalError.
    matrix_name = ''
    with open(log_file,"r") as fp:
        for line in fp:
            split = line.split()
            split2 = line.split(",")
            if len(split) > 0 and "pmpakos@" in split[0]:
                matrix_name = ''

            if len(split) > 2 and split[0] == "INFO:" and split[1] == "loading" and split[2] == "Mtx":
                print(split[-1])
                matrix_name = split[-1].split("/")[-1]
                matrix_name = matrix_name.replace(".mtx","")

            if len(split2) > 2 and split2[1] != " matrix":
                line = line.replace("app",matrix_name)
                # str.strip("DATA_CSV:,") would treat its argument as a set of
                # characters and also eat leading D/A/T/_/C/S/V of the matrix
                # name, so remove the tag as an exact prefix instead.
                if line.startswith(prefix):
                    line = line[len(prefix):]
                results_list.append(line)
                print(line)

    with open(results_file,"w") as file:
        file.writelines(results_list)

In [3]:
def clean_matrix_generation_log(start_gen, results_gen):
    """Turn the ``>>>> `` lines of a matrix-generation log into CSV rows.

    For every line containing the ``">>>> "`` marker, the text after the
    marker is converted as follows: spaces become commas, ``".mtx"`` is
    removed, and the first field (the matrix name) has ``".0_"`` collapsed to
    ``"_"`` and its fourth underscore-separated component dropped. All other
    lines are ignored.

    Parameters
    ----------
    start_gen : str
        Path of the raw generation log.
    results_gen : str
        Path of the cleaned output; overwritten if it exists.

    Raises
    ------
    IndexError
        If a marked line's matrix name has fewer than four
        underscore-separated components.
    """
    marker = ">>>> "
    # The output is opened first, as before, so it is created or truncated
    # even when the input turns out to be missing.
    with open(results_gen,"w") as fw, open(start_gen) as fp:
        for line in fp:
            if marker not in line:
                continue
            # The marker is detected anywhere in the line, so cut at the marker
            # rather than stripping '>' and ' ' characters from the line ends.
            payload = line.split(marker, 1)[1].lstrip("> ")
            fields = payload.replace(" ",",").replace(".mtx","").split(",")
            name_parts = fields[0].replace(".0_","_").split("_")
            # Drop the fourth name component so the name matches the one used
            # in the benchmark result files -- presumably; confirm with the
            # matrix generator's naming scheme.
            del name_parts[3]
            fields[0] = "_".join(name_parts)
            fw.write(",".join(fields))

def clean_results(start_list, results_list):
    """Extract the ``DATA_CSV:,`` rows from raw run logs into clean CSV files.

    Each input path in ``start_list`` is paired with the output path at the
    same position in ``results_list`` (``zip`` stops at the shorter list). For
    every input line containing ``"DATA_CSV:,"``, the text after that tag is
    written to the output with every ``".0_"`` replaced by ``"_"``. All other
    lines are dropped.

    Parameters
    ----------
    start_list : list of str
        Paths of the raw log files.
    results_list : list of str
        Paths of the cleaned files to write; existing files are overwritten.
    """
    marker = "DATA_CSV:,"
    for t1,t2 in zip(start_list,results_list):
        with open(t2,"w") as fw, open(t1) as fp:
            for line in fp:
                if marker in line:
                    # Cut at the tag itself. str.strip(marker) would strip any
                    # of the characters D, A, T, _, C, S, V, ':' and ',' and so
                    # also mangle matrix names that start with those letters.
                    payload = line.split(marker, 1)[1]
                    fw.write(payload.replace(".0_","_"))
    
def read_results_single(result_file):
    """Load one cleaned, header-less, comma-separated results file.

    The file is read with a fixed set of 14 column names, supplied here
    because the file itself carries no header row.

    Parameters
    ----------
    result_file : str
        Path of the cleaned results file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the columns named below.
    """
    column_names = [
        "matrix_name",
        "original rows",
        "original cols",
        "original NNZs",
        "padded rows",
        "padded cols",
        "padded NNZs",
        "padding overhead[%]",
        "num of runs",
        "total run time[sec]",
        "time[ms]/run",
        "performance[GFLOPs]",
        "performance (padded)[GFLOPs]",
        "mem_footprint[MB]",
    ]
    return pd.read_table(result_file, delimiter=",", names=column_names)

def read_results(results_list, results_gen, results_csv):
    """Merge cleaned benchmark runs with generation statistics into one CSV.

    For every distinct matrix name in the first run file:

    * the matrix parameters are decoded from the underscore-separated name
      (field 0: base name, 1: rows, 2: columns, 6: diagonal factor when the
      name does not have exactly 7 fields, last: seed plus a letter, where
      ``g`` selects "gamma" and anything else "normal");
    * time per run, padded performance and memory footprint are gathered from
      every run file and reduced with a harmonic mean;
    * the row with the same name in the generation-statistics table supplies
      the remaining features;
    * one comma-separated line is collected for the output.

    Matrices with a negative padded performance in any run are skipped. The
    collected lines are written to ``results_csv`` and a one-line summary is
    printed.

    Parameters
    ----------
    results_list : list of str
        Paths of the cleaned per-run result files (see ``read_results_single``).
    results_gen : str
        Path of the cleaned generation-statistics file (header-less CSV).
    results_csv : str
        Output path; overwritten if it exists.

    Raises
    ------
    IndexError
        If ``results_list`` is empty, if a matrix name has too few fields, or
        if a matrix is missing from a later run or from the generation table.
    """
    header = ["matrix_name","mem_range","avg_nnz_per_row","std_nnz_per_row","avg_bw","std_bw","avg_sc","std_sc"]
    df_gen = pd.read_table(results_gen, delimiter =",", names=header)

    dataframes = [read_results_single(result_file) for result_file in results_list]
    matrices = dataframes[0]["matrix_name"].unique()

    n_runs = len(results_list)
    # NOTE(review): presumably the average board power in watts -- confirm.
    # Multiplied by a time in ms below, the "J_estimated" column would then be
    # in millijoules rather than joules.
    W_avg = 33
    results_final = []
    for matrix in matrices:
        matrix_list = matrix.split("_")
        mtx_name = matrix_list[0]

        nr_rows = int(matrix_list[1])
        nr_cols = int(matrix_list[2])

        if(len(matrix_list)==7):#random
            placement = "random"
            diagonal_factor = 1
        else: #diagonal
            placement = "diagonal"
            diagonal_factor = matrix_list[6]

        # The last name field is the seed with the distribution letter attached.
        seed = matrix_list[-1].strip("n").strip("g")
        distr = list(filter(lambda x: x.isalpha(), matrix_list[-1]))[0]
        distribution = "gamma" if distr=="g" else "normal"

        first_run_rows = dataframes[0][dataframes[0]["matrix_name"]==matrix]
        nr_nnz = np.asarray(first_run_rows["original NNZs"])[0]
        density = nr_nnz/(nr_rows*nr_cols)*100

        runtime_iter = np.zeros(shape=(n_runs,1))
        perf_padded = np.zeros(shape=(n_runs,1))
        mem_footprint = np.zeros(shape=(n_runs,1))

        for run in range(0,n_runs):
            df = dataframes[run]
            # Select this matrix's rows once per run instead of once per column.
            run_rows = df[df["matrix_name"]==matrix]
            runtime_iter[run] = np.asarray(run_rows["time[ms]/run"])[0]
            perf_padded[run] = np.asarray(run_rows["performance (padded)[GFLOPs]"])[0]
            mem_footprint[run] = np.asarray(run_rows["mem_footprint[MB]"])[0]

        # A negative padded performance is treated as a failed measurement
        # (why the benchmark reports it is not known). Every run is checked,
        # not only the first, because the harmonic mean is undefined for
        # negative values.
        if (perf_padded < 0).any():
            continue

        runtime_iter_hm = hmean(runtime_iter,axis=0)[0]
        perf_padded_hm = hmean(perf_padded,axis=0)[0]
        mem_footprint_hm = hmean(mem_footprint,axis=0)[0]

        selected_gen = df_gen[df_gen["matrix_name"]==matrix]
        mem_range = np.asarray(selected_gen["mem_range"])[0]
        avg_nnz_per_row = np.asarray(selected_gen["avg_nnz_per_row"])[0]
        std_nnz_per_row = np.asarray(selected_gen["std_nnz_per_row"])[0]
        avg_bw = np.asarray(selected_gen["avg_bw"])[0]
        std_bw = np.asarray(selected_gen["std_bw"])[0]
        avg_sc = np.asarray(selected_gen["avg_sc"])[0]
        # NOTE(review): only std_sc is divided by the column count, avg_sc is
        # not -- confirm this asymmetry is intended.
        std_sc = np.asarray(selected_gen["std_sc"])[0]/nr_cols

        J_estimated = W_avg*runtime_iter_hm

        line_list = [mtx_name, distribution, placement, diagonal_factor, seed,
                     nr_rows, nr_cols, nr_nnz, density, mem_footprint_hm, mem_range,
                     avg_nnz_per_row, std_nnz_per_row,
                     avg_bw, std_bw, avg_sc, std_sc,
                     "Xilinx_SpMV", runtime_iter_hm, perf_padded_hm, W_avg, J_estimated
                    ]
        results_final.append(",".join(str(x) for x in line_list) + "\n")

    with open(results_csv,"w") as file:
        file.writelines(results_final)
    print("Results for",len(results_final),"matrices for",results_csv.split("/")[-1])

    
def extract_results_of_distr_memrange(distr_memrange):
    """Run the whole clean-and-merge pipeline for one dataset tag.

    ``distr_memrange`` (e.g. ``"normal_4-8"``) is spliced into fixed relative
    paths. The generation log under ``./generation_stats/`` is cleaned, the
    single raw run log under ``./dirty/`` is cleaned into ``./clean/``, and the
    two are merged into ``./results_<tag>.csv`` by ``read_results``.
    """
    def tagged(head, tail):
        # Build "<head><tag><tail>" for the dataset tag being processed.
        return head + distr_memrange + tail

    raw_gen_log = tagged("./generation_stats/", "_log.txt")
    clean_gen_log = tagged("./generation_stats/", "_log_CLEAN.txt")
    clean_matrix_generation_log(raw_gen_log, clean_gen_log)

    # Only one benchmark run is processed; the lists keep the interface of the
    # multi-run helpers.
    raw_runs = [tagged("./dirty/", "_run1.txt")]
    clean_runs = [tagged("./clean/", "_run_CLEAN1.txt")]
    clean_results(raw_runs, clean_runs)

    merged_csv = tagged("./results_", ".csv")
    read_results(clean_runs, clean_gen_log, merged_csv)

In [4]:
%%time
# Clean and merge the logs of the "normal_4-8" dataset; the tag selects the
# input/output file names inside extract_results_of_distr_memrange.
distr_memrange = "normal_4-8"
extract_results_of_distr_memrange(distr_memrange)

Results for 764 matrices for results_normal_4-8.csv
CPU times: user 1.99 s, sys: 22.9 ms, total: 2.01 s
Wall time: 2.42 s


In [5]:
%%time
# Clean and merge the logs of the "normal_8-16" dataset; the tag selects the
# input/output file names inside extract_results_of_distr_memrange.
distr_memrange = "normal_8-16"
extract_results_of_distr_memrange(distr_memrange)

Results for 864 matrices for results_normal_8-16.csv
CPU times: user 2.28 s, sys: 23.7 ms, total: 2.3 s
Wall time: 2.39 s


In [6]:
%%time
# Clean and merge the logs of the "normal_16-32" dataset; the tag selects the
# input/output file names inside extract_results_of_distr_memrange.
distr_memrange = "normal_16-32"
extract_results_of_distr_memrange(distr_memrange)

Results for 844 matrices for results_normal_16-32.csv
CPU times: user 2.21 s, sys: 3.88 ms, total: 2.22 s
Wall time: 2.34 s


In [7]:
%%time
# Clean and merge the logs of the "normal_32-64" dataset; the tag selects the
# input/output file names inside extract_results_of_distr_memrange.
distr_memrange = "normal_32-64"
extract_results_of_distr_memrange(distr_memrange)

Results for 1212 matrices for results_normal_32-64.csv
CPU times: user 3.47 s, sys: 27.4 ms, total: 3.5 s
Wall time: 3.83 s


In [8]:
%%time
# Clean and merge the logs of the "normal_64-128" dataset; the tag selects the
# input/output file names inside extract_results_of_distr_memrange.
distr_memrange = "normal_64-128"
extract_results_of_distr_memrange(distr_memrange)

Results for 1232 matrices for results_normal_64-128.csv
CPU times: user 3.28 s, sys: 28.2 ms, total: 3.3 s
Wall time: 3.66 s


In [4]:
%%time
# Clean and merge the logs of the "normal_128-256" dataset; the tag selects the
# input/output file names inside extract_results_of_distr_memrange.
distr_memrange = "normal_128-256"
extract_results_of_distr_memrange(distr_memrange)

Results for 1118 matrices for results_normal_128-256.csv
CPU times: user 2.96 s, sys: 13.4 ms, total: 2.98 s
Wall time: 3.22 s


In [10]:
%%time
# Clean and merge the logs of the "normal_256-512" dataset; the tag selects the
# input/output file names inside extract_results_of_distr_memrange.
distr_memrange = "normal_256-512"
extract_results_of_distr_memrange(distr_memrange)

Results for 868 matrices for results_normal_256-512.csv
CPU times: user 2.26 s, sys: 23.9 ms, total: 2.29 s
Wall time: 2.67 s


In [11]:
%%time
# Clean and merge the logs of the "normal_512-1024" dataset; the tag selects
# the input/output file names inside extract_results_of_distr_memrange.
distr_memrange = "normal_512-1024"
extract_results_of_distr_memrange(distr_memrange)

Results for 200 matrices for results_normal_512-1024.csv
CPU times: user 513 ms, sys: 131 µs, total: 514 ms
Wall time: 950 ms


In [12]:
%%time
# Clean and merge the logs of the "normal_1024-2048" dataset. The recorded
# output of this cell reports 0 matrices, so the resulting CSV is empty.
distr_memrange = "normal_1024-2048"
extract_results_of_distr_memrange(distr_memrange)

Results for 0 matrices for results_normal_1024-2048.csv
CPU times: user 8.91 ms, sys: 0 ns, total: 8.91 ms
Wall time: 65 ms


In [13]:
%%time
# Concatenate the per-range result CSVs of the "normal" distribution, in the
# order listed in mem_ranges, into one dataset file.
distr_base = "normal"
mem_ranges = ["4-8","8-16","16-32","32-64","64-128","128-256","256-512","512-1024","1024-2048"]

# Every results_<distr>_<range>.csv must already exist in the working
# directory; a missing one makes open() raise.
filenames = ["results_"+distr_base+"_"+mr+".csv" for mr in mem_ranges]

# Mode 'w' overwrites any existing combined file. The inputs carry no header
# row, so plain concatenation yields a valid header-less CSV.
with open('../Benchmarks/xilinx_spmv_4-2048_normal_dataset_dtype-D.csv', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            outfile.write(infile.read())

CPU times: user 2.75 ms, sys: 15.8 ms, total: 18.6 ms
Wall time: 714 ms


# All memory ranges in a single pass: extract every range, then concatenate

In [14]:
%%time

# Run the clean-and-merge pipeline for every memory range of the "normal"
# distribution, then concatenate the per-range CSVs into one dataset file.
distr_base = "normal"
mem_ranges = ["4-8","8-16","16-32","32-64","64-128","128-256","256-512","512-1024","1024-2048"]

# Each call writes ./results_<distr>_<range>.csv and prints a one-line summary.
for mem_range in mem_ranges:
    distr_memrange = distr_base + "_" + mem_range
    extract_results_of_distr_memrange(distr_memrange)

filenames = ["results_"+distr_base+"_"+mr+".csv" for mr in mem_ranges]

# Mode 'w' overwrites any existing combined file; the inputs are appended in
# the order of mem_ranges.
with open('../Benchmarks/xilinx_spmv_4-2048_normal_dataset_dtype-D.csv', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            outfile.write(infile.read())

Results for 764 matrices for results_normal_4-8.csv
Results for 864 matrices for results_normal_8-16.csv
Results for 844 matrices for results_normal_16-32.csv
Results for 1212 matrices for results_normal_32-64.csv
Results for 1232 matrices for results_normal_64-128.csv
Results for 885 matrices for results_normal_128-256.csv
Results for 868 matrices for results_normal_256-512.csv
Results for 200 matrices for results_normal_512-1024.csv
Results for 0 matrices for results_normal_1024-2048.csv
CPU times: user 19.8 s, sys: 176 ms, total: 20 s
Wall time: 21.6 s


In [5]:
%%time

# Run the clean-and-merge pipeline for every memory range of the "normal"
# distribution, then concatenate the per-range CSVs into one dataset file.
distr_base = "normal"
mem_ranges = ["4-8","8-16","16-32","32-64","64-128","128-256","256-512","512-1024","1024-2048"]

# Each call writes ./results_<distr>_<range>.csv and prints a one-line summary.
for mem_range in mem_ranges:
    distr_memrange = distr_base + "_" + mem_range
    extract_results_of_distr_memrange(distr_memrange)

filenames = ["results_"+distr_base+"_"+mr+".csv" for mr in mem_ranges]

# Mode 'w' overwrites any existing combined file; the inputs are appended in
# the order of mem_ranges.
with open('../Benchmarks/xilinx_spmv_4-2048_normal_dataset_dtype-D.csv', 'w') as outfile:
    for fname in filenames:
        with open(fname) as infile:
            outfile.write(infile.read())

Results for 764 matrices for results_normal_4-8.csv
Results for 864 matrices for results_normal_8-16.csv
Results for 844 matrices for results_normal_16-32.csv
Results for 1212 matrices for results_normal_32-64.csv
Results for 1232 matrices for results_normal_64-128.csv
Results for 1118 matrices for results_normal_128-256.csv
Results for 908 matrices for results_normal_256-512.csv
Results for 200 matrices for results_normal_512-1024.csv
Results for 0 matrices for results_normal_1024-2048.csv
CPU times: user 18.9 s, sys: 143 ms, total: 19 s
Wall time: 22.2 s


---

---

In [15]:
def read_results2(results_list, results_csv):
    """Summarise cleaned benchmark runs into one CSV without generation stats.

    For every distinct matrix name in the first run file, the dimensions and
    non-zero count are taken from that first run, the time per run, padded
    performance and memory footprint are gathered from every run and reduced
    with a harmonic mean, and one comma-separated line is collected. Matrices
    with a negative padded performance in any run are skipped. The collected
    lines are written to ``results_csv``.

    Parameters
    ----------
    results_list : list of str
        Paths of the cleaned per-run result files (see ``read_results_single``).
    results_csv : str
        Output path; overwritten if it exists.

    Raises
    ------
    IndexError
        If ``results_list`` is empty or a matrix is missing from a later run.
    """
    dataframes = [read_results_single(result_file) for result_file in results_list]
    # Matrices are processed in order of first appearance in the first run
    # file; no sorting by memory footprint takes place.
    matrices = dataframes[0]["matrix_name"].unique()

    n_runs = len(results_list)
    # NOTE(review): presumably the average board power in watts -- confirm.
    # Multiplied by a time in ms below, the "J_estimated" column would then be
    # in millijoules rather than joules.
    W_avg = 33
    results_final = []
    for matrix in matrices:
        first_run_rows = dataframes[0][dataframes[0]["matrix_name"]==matrix]
        nr_rows = np.asarray(first_run_rows["original rows"])[0]
        nr_cols = np.asarray(first_run_rows["original cols"])[0]
        nr_nnz = np.asarray(first_run_rows["original NNZs"])[0]
        density = nr_nnz/(nr_rows*nr_cols)*100

        runtime_iter = np.zeros(shape=(n_runs,1))
        perf_padded = np.zeros(shape=(n_runs,1))
        mem_footprint = np.zeros(shape=(n_runs,1))

        for run in range(0,n_runs):
            df = dataframes[run]
            # Select this matrix's rows once per run instead of once per column.
            run_rows = df[df["matrix_name"]==matrix]
            runtime_iter[run] = np.asarray(run_rows["time[ms]/run"])[0]
            perf_padded[run] = np.asarray(run_rows["performance (padded)[GFLOPs]"])[0]
            mem_footprint[run] = np.asarray(run_rows["mem_footprint[MB]"])[0]

        # A negative padded performance is treated as a failed measurement
        # (why the benchmark reports it is not known). Every run is checked,
        # not only the first, because the harmonic mean is undefined for
        # negative values.
        if (perf_padded < 0).any():
            continue

        runtime_iter_hm = hmean(runtime_iter,axis=0)[0]
        perf_padded_hm = hmean(perf_padded,axis=0)[0]
        mem_footprint_hm = hmean(mem_footprint,axis=0)[0]

        J_estimated = W_avg*runtime_iter_hm

        line_list = [matrix, nr_rows, nr_cols, nr_nnz, density, mem_footprint_hm,
                     "Xilinx_SpMV", runtime_iter_hm, perf_padded_hm, W_avg, J_estimated
                    ]
        results_final.append(",".join(str(x) for x in line_list) + "\n")

    with open(results_csv,"w") as file:
        file.writelines(results_final)

In [16]:
# Process the validation-matrix benchmark: clean the single raw run log, then
# summarise it with read_results2, which takes no generation-statistics file.
start_list = [
    "./dirty/validation_matrices_run1.txt",
]

# Cleaned counterpart of each raw log, paired by position with start_list.
results_list = [
    "./clean/validation_matrices_run1.txt",
]
clean_results(start_list, results_list)

results_csv = "./results_validation_matrices.csv"
read_results2(results_list, results_csv)

---

In [5]:
def clean_progress_log(start_gen, clean_gen):
    """Copy a log file, dropping progress-bar lines and empty lines.

    A line is dropped when it contains ``"%|"`` (presumably the signature of a
    tqdm-style progress bar -- confirm against the logs) or when it is exactly
    ``"\\n"``. Every other line is copied unchanged.

    Parameters
    ----------
    start_gen : str
        Path of the raw progress log.
    clean_gen : str
        Path of the filtered copy; overwritten if it exists.
    """
    # The output is opened first, as before, so it is created or truncated
    # even when the input turns out to be missing.
    with open(clean_gen,"w") as fw, open(start_gen) as fp:
        for line in fp:
            if "%|" in line or line == "\n":
                continue
            fw.write(line)
    
# Filter the progress logs of the "128-256" range: for IDs 1..6, every
# existing file <prefix><range>_<ID> gets a filtered copy with the suffix
# "_clean".
prefix = "../artificial_matrix_generation/pmpakos_impl/matrix_generation_parameters/double/rest/progress_"
distr_memrange = "128-256"

for ID in range(1,7,1):
    start_gen = prefix+distr_memrange+"_"+str(ID)
    # Missing logs are skipped silently; only processed IDs are printed.
    if(os.path.exists(start_gen)):
        clean_gen = prefix+distr_memrange+"_"+str(ID)+"_clean"
        print(ID)
        clean_progress_log(start_gen, clean_gen)