In [441]:
import pandas as pd
import os
import seaborn as sns

# kill old plots
for f in os.listdir('plots/correctness_test_jiela/scaling'):
    os.remove(os.path.join('plots/correctness_test_jiela/scaling', f))
for f in os.listdir('plots/correctness_test_jiela/all_impls'):
    os.remove(os.path.join('plots/correctness_test_jiela/all_impls', f))

### Global params

In [442]:
num_warmups = 20  # with the full data this should probably be something like 20
consistency_check = False
implementations_to_ignore = [
    # 'coo_opt_vectorization_SDDMM_GPU',
    # 'merged',
    # 'naive_SDDMM_GPU',
    # 'naive_coo_SDDMM_GPU',
    # 'semi_naive_CSR_SDDMM_GPU'
    ]
log_scale = False


### Load data

In [443]:
# This list only contains the relevant paths!
base_paths = [
    # "/scratch/eschreib/results_full_run/",
    # "/scratch/eschreib/results_downloaded_size_generated/",
    # "/scratch/eschreib/results_downloaded_24k_generated/"
    # "/scratch/eschreib/results_MatrixMarket_10k/",
    # "/scratch/eschreib/results_MatrixMarket_24k/"
    "/scratch/eschreib/results_ABsquare/"
    ]

# All the irrelevant paths we have, if you want to test a specific one, put it into the base_paths list
    # "/scratch/eschreib/results/"
    # "/scratch/eschreib/results_K100/"
    # "/scratch/eschreib/results_MatrixMarket/"
    # "/scratch/eschreib/results_test/"

# The paths that are actually relevant
# "/scratch/eschreib/results_full_run/"
# "/scratch/eschreib/results_downloaded_size_generated/"
# "/scratch/eschreib/results_downloaded_24k_generated/"
# "/scratch/eschreib/results_MatrixMarket_10k/"
# "/scratch/eschreib/results_MatrixMarket_24k/"

# create the dataframe to which we can append our sub-dataframes
data = pd.DataFrame()

for base_path in base_paths:

    # collect all files that contain run data
    file_paths = []
    impl_name = os.listdir(base_path)
    for impl in impl_name:
        sizes = os.listdir(base_path + impl)
        # there is exactly one csv file per size_dir
        for size_dir in sizes:
            files = os.listdir(base_path + impl + "/" + size_dir)
            file_path = base_path + impl + "/" + size_dir + "/" + files[0]  # only one csv file per dir
            file_paths.append(file_path)
    sub_data = []
    for file in file_paths:
        with open(file, 'r') as fin:
            size = file.split("/")[-2]
            lines = fin.readlines()
            lines.pop(0)  # remove header line
            lines = list(map(lambda x: x.replace(",\n", ""), lines))  # remove trailing newline since they would create an extra column at the end
            lines = list(map(lambda x: x.split(","), lines))  # split into columns
            lines = list(map(lambda x: [x[0]] + ["0." + (x[3].split("/")[-1].split(".")[0].split("_")[-1][1:])] + x[1:], lines))
            lines = list(map(lambda x: [x[0]] + [size] + x[1:], lines))  # add size info
            sub_data += lines
    # make a subdata dataframe and add an indicator if we want it to be in the special configurations plot
    special_config = base_path != "/scratch/eschreib/results_full_run/"
        
    sub_data = pd.DataFrame(sub_data)
    sub_data["special_config"] = special_config
    # add the subdata to our main big dataframe
    data = pd.concat([data, sub_data], axis=0)

data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,217,218,219,220,221,222,223,224,225,special_config
0,semi_naive_CSR_SDDMM_GPU,1000x1000,0.005,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,69632.001221,55296.000093,55296.000093,54271.999747,...,,,,,,,,,,True
1,semi_naive_CSR_SDDMM_GPU,1000x1000,0.001,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,35840.000957,19455.999136,18432.000652,18432.000652,...,,,,,,,,,,True
2,semi_naive_CSR_SDDMM_GPU,1000x1000,0.0005,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,30719.999224,15359.999612,15359.999612,14336.000197,...,,,,,,,,,,True
3,semi_naive_CSR_SDDMM_GPU,1000x1000,0.0001,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,24576.000869,12288.000435,11264.000088,11264.000088,...,,,,,,,,,,True
4,semi_naive_CSR_SDDMM_GPU,1000x1000,1e-05,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,22528.000176,12288.000435,11264.000088,10239.999741,...,,,,,,,,,,True
5,semi_naive_CSR_SDDMM_GPU,10000x10000,0.005,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,44125183.105469,44034046.173096,44058624.267578,44071937.561035,...,,,,,,,,,,True
6,semi_naive_CSR_SDDMM_GPU,10000x10000,0.001,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,8886272.43042,8821760.177612,8846336.364746,8878080.368042,...,,,,,,,,,,True
7,semi_naive_CSR_SDDMM_GPU,10000x10000,0.0005,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,4487167.835236,4485119.819641,4466688.156128,4461567.878723,...,,,,,,,,,,True
8,semi_naive_CSR_SDDMM_GPU,10000x10000,0.0001,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,979968.011379,957440.018654,959487.974644,955392.003059,...,,,,,,,,,,True
9,semi_naive_CSR_SDDMM_GPU,10000x10000,1e-05,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,/scratch/eschreib/matrices/Dataset_generated_m...,149504.005909,128000.00608,126975.998282,126975.998282,...,,,,,,,,,,True


### Preprocess data

In [444]:
# A helper function to turn the sparsities int percentages
def to_percent (a):
    # remove 3 characters
    a = a[3:]
    # We assume that we never have more than 9._%
    before_comma = a[0]
    a = a[1:]
    a = before_comma + "." + a + "%"

    return a

# remove warmup measurements
metadata = data.iloc[:, :3]
measurements = data.iloc[:, 3:]
measurements = measurements.iloc[:, num_warmups:]
data = pd.concat([metadata, measurements], axis=1)
print(data)
# remove all the broken runs (= the impls that start with "**")
data = data[~data[0].str.startswith("**")]

# prettify df
data = data.drop(data.columns[[3, 4, 5]], axis=1)  # rm the paths to the matrices
data = data.sort_values(by=[0])  # sort by implementation name
data.columns = range(data.shape[1])  # reset the column names to consecutive numbers
data.head()

sparsities = data[2].unique()
print(sparsities)
data[2] = data[2].apply(to_percent)

impls = data[0].unique()
sizes = data[1].unique()
sparsities = data[2].unique()
sparsities.sort()  # if I don't seaborn messes up the order of the x axis in some plots

print(impls)
print(sizes)
print(sparsities)

# data
# print(data)

# We now expand such that we can use seaborn
# specifically this means we only have one measurement per row
# d_exp stands for data expanded. Meaning each row only contains a single measurement
 
d_exp = pd.DataFrame(columns=['implementation', 'size', 'sparsity', 'ns', 'configuration'])

for row_idx in range(len(data)):

    row = data.iloc[row_idx]
    row_len = len(row)

    cur_imp = row[0]
    cur_size = row[1]
    cur_spars = row[2]
    cur_conf = row[row_len-1]
    current_configuration = "No"

    if cur_conf:
        current_configuration = "size: " + cur_size + " sparsity: " + cur_spars

    for measurement in row[3:]:

        if measurement is not None:
            new_row = {'implementation': cur_imp, 'size': cur_size, 'sparsity': cur_spars, 'ns': float(measurement), 'configuration':current_configuration}
            d_exp.loc[len(d_exp)] = new_row

configurations = d_exp["configuration"].unique()
configurations.sort()

d_exp

                                  0            1        2                23  \
0          semi_naive_CSR_SDDMM_GPU    1000x1000    0.005      53247.999400   
1          semi_naive_CSR_SDDMM_GPU    1000x1000    0.001      20479.999483   
2          semi_naive_CSR_SDDMM_GPU    1000x1000   0.0005      15359.999612   
3          semi_naive_CSR_SDDMM_GPU    1000x1000   0.0001      11264.000088   
4          semi_naive_CSR_SDDMM_GPU    1000x1000  0.00001      10239.999741   
5          semi_naive_CSR_SDDMM_GPU  10000x10000    0.005   44192768.096924   
6          semi_naive_CSR_SDDMM_GPU  10000x10000    0.001    8855551.719666   
7          semi_naive_CSR_SDDMM_GPU  10000x10000   0.0005    4451327.800751   
8          semi_naive_CSR_SDDMM_GPU  10000x10000   0.0001     959487.974644   
9          semi_naive_CSR_SDDMM_GPU  10000x10000  0.00001     130048.006773   
10  coo_opt_vectorization_SDDMM_GPU    1000x1000    0.005     212293.000000   
11  coo_opt_vectorization_SDDMM_GPU    1000x1000    

Unnamed: 0,implementation,size,sparsity,ns,configuration
0,coo_opt_vectorization_SDDMM_GPU,1000x1000,0.001%,23340.000000,size: 1000x1000 sparsity: 0.001%
1,coo_opt_vectorization_SDDMM_GPU,1000x1000,0.001%,23580.000000,size: 1000x1000 sparsity: 0.001%
2,coo_opt_vectorization_SDDMM_GPU,1000x1000,0.001%,23581.000000,size: 1000x1000 sparsity: 0.001%
3,coo_opt_vectorization_SDDMM_GPU,1000x1000,0.001%,23660.000000,size: 1000x1000 sparsity: 0.001%
4,coo_opt_vectorization_SDDMM_GPU,1000x1000,0.001%,23371.000000,size: 1000x1000 sparsity: 0.001%
...,...,...,...,...,...
3245,semi_naive_CSR_SDDMM_GPU,1000x1000,0.5%,53247.999400,size: 1000x1000 sparsity: 0.5%
3246,semi_naive_CSR_SDDMM_GPU,1000x1000,0.5%,54271.999747,size: 1000x1000 sparsity: 0.5%
3247,semi_naive_CSR_SDDMM_GPU,1000x1000,0.5%,54271.999747,size: 1000x1000 sparsity: 0.5%
3248,semi_naive_CSR_SDDMM_GPU,1000x1000,0.5%,53247.999400,size: 1000x1000 sparsity: 0.5%


### Scaling plots

In [445]:
def plotting_simple(data_to_plot, impl, iteration_metric, x_axis, x_axis_label):
    sns.set_theme(style="whitegrid")
    sns.set(rc={'figure.figsize':(30,8.27)})
    ax = sns.barplot(data= data_to_plot, x = x_axis, y = "ns", estimator='median', errorbar='ci')

    if log_scale:
        ax.set_yscale('log')

    if x_axis_label == "size":
        ax.set_title(impl + ", sparsity: " + str(iteration_metric))
    elif x_axis_label == "sparsity":
        ax.set_title(impl + ", size: " + str(iteration_metric))
    ax.set_ylabel("time in ns")
    ax.set_xlabel(x_axis_label)

    # save plot
    fig = ax.get_figure()
    # use a non standard seperator to read the asocited parts from the file name easier
    separator = "  "
    if x_axis_label == "size":
        fig.savefig("plots/correctness_test_jiela/scaling/impl_name=" + impl + separator + "sparsity=" + str(sparsity) + ".png")
    elif x_axis_label == "sparsity":
        fig.savefig("plots/correctness_test_jiela/scaling/impl_name=" + impl + separator + "size=" + size + ".png")
    fig.clf()  # reset seaborn to avoid stuff from accumulating



# 1: fix implementation, fix size, iterate over sparsity DAS HIER DU HORNOCHSE
for impl in impls:
    # This is only relevant if there is an implementation we don't want in our plot
    if impl in implementations_to_ignore:
         continue
    
    for size in sizes:

        # for sparsity in sparsities:  # there should be no need for .unique()

            d_to_plt = d_exp[d_exp['size'] == size]
            d_to_plt = d_to_plt[d_to_plt['implementation'] == impl]

            plotting_simple(d_to_plt, impl, size, "sparsity", "sparsity")
if consistency_check: print()

# 2: fix implementation, fix sparsity, iterate over size
it = 1
for impl in impls:
    # This is only relevant if there is an implementation we don't want in our plot
    if impl in implementations_to_ignore:
         continue
    
    for sparsity in sparsities:

            d_to_plt = d_exp[d_exp['implementation'] == impl]
            d_to_plt = d_to_plt[d_to_plt['sparsity'] == sparsity]

            plotting_simple(d_to_plt, impl, sparsity, "size", "size")
if consistency_check: print()

<Figure size 3000x827 with 0 Axes>

### All in one plots

In [446]:
from matplotlib.container import ErrorbarContainer


def grouped_plotting(data_to_plot, group_iterator, fixed_metric):

    sns.set(style="whitegrid")
    sns.set(rc={'figure.figsize':(30,8.27)})

    if group_iterator == "size":
        hue = "size"
        hue_order = sizes
    elif group_iterator == "sparsity":
        hue = "sparsity"
        hue_order = sparsities
    elif group_iterator == "configuration":
        hue = "configuration"
        hue_order = configurations
    
    ax = sns.barplot(x="implementation", y="ns", hue=hue, hue_order=hue_order, data=data_to_plot, estimator='median', errorbar='ci')

    if log_scale:
        ax.set_yscale('log')
        
    # print("doing all in one plot")
    sns.set(rc={'figure.figsize':(60,8.27)})
    separator = "  "
    fig = ax.get_figure()
    ax.set_xlabel("implementation")
    ax.set_ylabel("time in ns")
    # plt.yscale('log')
    if group_iterator == "sparsity":
        ax.set_title("size: " + str(fixed_metric))
        fig.savefig("plots/correctness_test_jiela/all_impls/size=" + str(fixed_metric) + ".png")
    elif group_iterator == "size":
        ax.set_title("sparsity: " + str(fixed_metric))
        fig.savefig("plots/correctness_test_jiela/all_impls/sparsity=" + str(fixed_metric) + ".png")
    elif group_iterator == "configuration":
        ax.set_title("Special Configurations")
        fig.savefig("plots/correctness_test_jiela/all_impls/special_config.png")
    fig.clf()  # reset seaborn to avoid stuff from accumulating
    

# 1. fix sparsity, group iterator = size, x-axis-ticks iterator = implementation
for sparsity in sparsities:

    d_to_plt = d_exp[d_exp['sparsity'] == sparsity]

    # This is only relevant if there is an implementation we don't want in our plot
    for impl in implementations_to_ignore:
        d_to_plt = d_to_plt[d_to_plt['implementation' != impl]]

    grouped_plotting(d_to_plt, "size", sparsity)


if consistency_check: print()

# 2. fix size, group iterator = sparsity, x-axis-ticks iterator = implementation
for size in sizes:
    
    d_to_plt = d_exp[d_exp['size'] == size]

    # This is only relevant if there is an implementation we don't want in our plot
    for impl in implementations_to_ignore:
        d_to_plt = d_to_plt[d_to_plt['implementation' != impl]]

    grouped_plotting(d_to_plt,"sparsity", size)

if consistency_check: print()

# 3. here we just have a grouped plot for the special config
d_to_plt = d_exp[d_exp['configuration'] != "No"]

# This is only relevant if there is an implementation we don't want in our plot
for impl in implementations_to_ignore:
    d_to_plt = d_to_plt[d_to_plt['implementation' != impl]]

grouped_plotting(d_to_plt, "configuration", "dummy")



<Figure size 3000x827 with 0 Axes>