In [None]:
# Imports
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import seaborn as sns
import seaborn.objects as so

import os
import math

# Seaborn theme
sns.set_style('whitegrid')

# Functions
def safe_read_csv(file_path):
    """Read a CSV file into a DataFrame, returning None when it cannot be loaded.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or None if the file is missing/unreadable (OSError)
        or empty/malformed/undecodable (ValueError and its subclasses, which
        include pandas' EmptyDataError and ParserError).
    """
    try:
        return pd.read_csv(file_path)
    except (OSError, ValueError):
        # Best-effort load: callers treat None as "no data for this dump".
        # Anything else (e.g. a programming error) is allowed to propagate.
        return None

In [None]:
# Data configuration
bucket_numbers = [1, 16, 32, 64, 128, 256]

# Every entry of the dump folder is treated as one application
dump_folder = "dumps/"

app_names_read = sorted(os.listdir(dump_folder))
app_names = [raw_name.replace('_', ' ') for raw_name in app_names_read]

# One inner list per application, holding one dataframe (or None when the
# file could not be read) per entry of bucket_numbers, in the same order
stats_dfs = []
tags_dfs = []
recvs_dfs = []

for app in app_names_read:
    # Each bucket number has its own "df<N>" sub-folder inside the application dump
    bucket_dirs = [os.path.join(dump_folder, app, f"df{bucket_size}") for bucket_size in bucket_numbers]

    stats_dfs.append([safe_read_csv(os.path.join(bucket_dir, "stats.csv")) for bucket_dir in bucket_dirs])
    tags_dfs.append([safe_read_csv(os.path.join(bucket_dir, "tags.csv")) for bucket_dir in bucket_dirs])
    recvs_dfs.append([safe_read_csv(os.path.join(bucket_dir, "recvs.csv")) for bucket_dir in bucket_dirs])

In [None]:
# Share of point-to-point vs collective MPI calls per application
# (the one-sided 'number_rdma' share is currently disabled)

instructions_res = []
for (app_name, stat_dfs) in zip(app_names, stats_dfs):
    # Only applications with one entry per bucket number and a loaded first dump are used
    has_all_buckets = len(stat_dfs) == len(bucket_numbers)
    if not (has_all_buckets and stat_dfs[0] is not None):
        continue

    first_df = stat_dfs[0]

    p2p_calls = first_df['number_p2p'].sum()
    collective_calls = first_df['number_collectives'].sum()
    all_calls = p2p_calls + collective_calls

    # (application, % point-to-point, % collectives)
    instructions_res.append((
        app_name,
        p2p_calls / all_calls * 100,
        collective_calls / all_calls * 100,
    ))

instructions_df = pd.DataFrame(instructions_res, columns=['Application', 'Point-to-point', 'Collectives'])

# Stacked bars: one bar per application
ax = instructions_df.plot.bar(x='Application', stacked=True)
ax.set_ylabel('Distribution of MPI calls')
ax.legend(loc='lower center')
ax.yaxis.set_major_formatter(ticker.PercentFormatter())

ax.get_figure().savefig("pictures/app-ops-analysis.svg")

In [None]:
# Number of tags used (number of unique tags vs number of applications)
# Should I use a scatter plot?

# Count how many applications use each exact number of unique tags
unique_tags_per_app = dict()
for (app_name, tag_dfs) in zip(app_names, tags_dfs):
    # Skip empty applications
    if (len(tag_dfs) != len(bucket_numbers)) or tag_dfs[0] is None:
        continue

    tag_df = tag_dfs[0]
    unique = len(tag_df.columns) - 3 # Columns are: rank, op, kind, tag#1, tag#2, ..., tag#n

    # Skip collective-only applications (and malformed files with fewer than 3 columns)
    if unique <= 0:
        continue

    unique_tags_per_app[unique] = unique_tags_per_app.get(unique, 0) + 1

# Data is split into: 1 tag, 2 tags, 3 tags, 4+ tags, 100+ tags, 1000+ tags
tag_bucket_labels = ['1', '2', '3', '4+', '100+', '1000+']
condensed_unique_tags = [0] * len(tag_bucket_labels)
for (key, num_apps) in unique_tags_per_app.items():
    if key >= 1000:
        insert_in = 5
    elif key >= 100:
        insert_in = 4
    elif key >= 4:
        # Counts 4..9 previously fell through to `key - 1`, which put them in
        # the wrong bucket or indexed past the end of the list
        insert_in = 3
    else:
        insert_in = key - 1  # 1, 2 or 3 tags -> positions 0, 1, 2

    condensed_unique_tags[insert_in] += num_apps

tag_usage_df = dict(zip(tag_bucket_labels, condensed_unique_tags))

fig, ax = plt.subplots()
sns.barplot(tag_usage_df, ax=ax)
ax.set_xlabel('Number of unique tags used')
ax.set_ylabel('Number of applications')

# Mark the bucket at which approx. 50% of the listed applications is reached
total_apps = len(app_names)
acc = 0
for (pos, unique_tags) in enumerate(condensed_unique_tags):
    acc += unique_tags
    if acc >= math.floor(total_apps * 0.5):
        ax.axvline(pos - 0.5, color='red')
        print(acc)
        break

for axis in [ax.xaxis, ax.yaxis]:
    axis.set_major_locator(ticker.MaxNLocator(integer=True))

# plt.savefig("pictures/app-tag-usage.svg")

In [None]:
# Number of unique (source, tag) pairs used in receives, per application
# Should I use a scatter plot?

# Collect one record per application and build the frame once, so the
# numeric columns get numeric dtypes instead of object
pair_records = []
for (app_name, recv_dfs) in zip(app_names, recvs_dfs):
    # Skip empty applications
    if (len(recv_dfs) != len(bucket_numbers)) or recv_dfs[0] is None:
        continue

    recv_df = recv_dfs[0]

    uniques = len(recv_df[['src', 'tag']].drop_duplicates())
    total = len(recv_df)

    # Applications without any recorded receive have no ratio to show
    if total == 0:
        continue

    pair_records.append((app_name, uniques, total))

combinations = pd.DataFrame(pair_records, columns=['app', 'unique_pairs', 'total_pairs'])

# Share of receives that use a distinct (source, tag) pair, in percent
combinations['perc'] = combinations['unique_pairs'] / combinations['total_pairs'] * 100

# ax = sns.barplot(combinations.sort_values('perc', ascending=False), y='perc', x='app') # sort by %
ax = sns.barplot(combinations, y='perc', x='app') # no sort. app alphabetically
ax.tick_params(axis='x', rotation=90)
ax.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=None))

ax.set_xlabel("Application")
ax.set_ylabel("Unique combinations of source and tag in receives")

plt.savefig("pictures/app-unique-tags.svg")

In [None]:
# Number of processes per application
for (app_name, stat_dfs) in zip(app_names, stats_dfs):
    # Only applications with one entry per bucket number and a loaded first dump are listed
    usable = (len(stat_dfs) == len(bucket_numbers)) and (stat_dfs[0] is not None)
    if not usable:
        continue

    # Highest rank id + 1 (assumes rank ids start at 0 -- TODO confirm)
    num_processes = stat_dfs[0]['rank'].max() + 1

    print("{:<40} = {:<4} processes".format(app_name, num_processes))

In [None]:
# Number of unique tags per application
for (app_name, tag_dfs) in zip(app_names, tags_dfs):
    # Only applications with one entry per bucket number and a loaded first dump are listed
    usable = (len(tag_dfs) == len(bucket_numbers)) and (tag_dfs[0] is not None)
    if not usable:
        continue

    # Columns are: rank, op, kind, tag#1, tag#2, ..., tag#n -> everything past the first three is a tag
    num_unique_tags = tag_dfs[0].shape[1] - 3

    print("{:<40} = {:<4} unique tags".format(app_name, num_unique_tags))

In [None]:
# Wildcard receives per application (sync + async counters added together)
for (app_name, stat_dfs) in zip(app_names, stats_dfs):
    # Only applications with one entry per bucket number and a loaded first dump are listed
    usable = (len(stat_dfs) == len(bucket_numbers)) and (stat_dfs[0] is not None)
    if not usable:
        continue

    first_df = stat_dfs[0]

    # Receives with a wildcard source
    src_wildcards = first_df['recv_src_wildcard_sync'].sum() + first_df['recv_src_wildcard_async'].sum()

    # Receives with a wildcard tag
    tag_wildcards = first_df['recv_tag_wildcard_sync'].sum() + first_df['recv_tag_wildcard_async'].sum()

    # Receives with both wildcards
    double_wildcards = first_df['recv_double_wildcard_sync'].sum() + first_df['recv_double_wildcard_async'].sum()

    print("{:<40} src = {:<6} tag = {:<6} double = {:<6}".format(app_name, src_wildcards, tag_wildcards, double_wildcards))



In [None]:
# Timeline of max. queue depth for each app
# Rows are numbered per rank and, for every row position, the max over all
# ranks is taken (change the aggregation to check other views)

for (app_name, stat_dfs) in zip(app_names, stats_dfs):
    # Skip applications with a missing dump for ANY bucket number: every entry
    # of stat_dfs is used below, not only the first one
    if (len(stat_dfs) != len(bucket_numbers)) or any(df is None for df in stat_dfs):
        continue

    # Top ylim for the bottom axis: peak of the second bucket number (16 bins)
    ylim_start = stat_dfs[1]['collisions_no_wildcard_max_len'].max()

    # Bottom ylim for the top axis: peak of the first bucket number (1 bin) minus mean ** 0.75.
    # More precise as to which plots get split in halves; shows fewer lines on the top half.
    # Alternative that shows the same number of lines on each half: max() - ylim_start
    ylim_end = stat_dfs[0]['collisions_no_wildcard_max_len'].max() - abs(pow(stat_dfs[0]['collisions_no_wildcard_max_len'].mean(), 0.75))

    dist = abs(ylim_start - ylim_end)

    # One column per bucket number
    plot_df = pd.DataFrame()
    for (bucket_size, stat_df) in zip(bucket_numbers, stat_dfs):
        local_df = stat_df[['rank', 'collisions_no_wildcard_max_len']].copy()

        # Position of each row within its rank (presumably the time step -- TODO confirm)
        local_df['group'] = local_df.groupby('rank').cumcount()
        local_df = local_df.groupby('group').agg({
            'collisions_no_wildcard_max_len': 'max'
        }).reset_index(drop=True)

        plot_df[str(bucket_size)] = local_df.copy()

    if dist < ylim_start:
        # Values are close enough: a single axis is readable
        fig, ax = plt.subplots()

        sns.lineplot(plot_df, ax=ax)

        ax.set_title(f"Timeline of max. queue depth for {app_name}")
        ax.set_xlabel("Runtime")
        ax.set_ylabel("Max. Queue depth", loc='bottom')
        ax.legend(title="Total bins")

        ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))

        ax.set_xticks([])
    else:
        # Broken y axis: top axis shows the high values, bottom axis the low ones
        fig, (ax_top, ax_bottom) = plt.subplots(nrows=2, sharex=True, gridspec_kw={'hspace':0.05})

        # Small diagonal marks on the left edge where the axis is cut
        d = .015
        kwargs = dict(transform=ax_top.transAxes, color='k', clip_on=False)
        ax_top.plot((-d, +d), (-d, +d), **kwargs)
        kwargs.update(transform=ax_bottom.transAxes)
        ax_bottom.plot((-d, +d), (1 - d, 1 + d), **kwargs)

        sns.lineplot(plot_df, ax=ax_top)
        sns.lineplot(plot_df, ax=ax_bottom)

        ax_top.set_ylim(bottom=ylim_end)
        ax_bottom.set_ylim(0, ylim_start + 1.0)

        # Keep a single legend (on the top axis)
        ax_bottom.legend_.remove()

        ax_top.set_title(f"Timeline of max. queue depth for {app_name}")
        ax_bottom.set_xlabel("Runtime")
        ax_top.set_ylabel("Max. Queue depth", loc='bottom')
        ax_top.legend(title="Total bins")

        ax_top.yaxis.set_major_locator(plt.MaxNLocator(integer=True))
        ax_bottom.yaxis.set_major_locator(plt.MaxNLocator(integer=True))

        ax_bottom.set_xticks([])

In [None]:
# Timeline of % occupied bins
# Rows are numbered per rank and, for every row position, the min share of
# empty bins over all ranks is taken (change the aggregation to check other views)

for (app_name, stat_dfs) in zip(app_names, stats_dfs):
    # Skip applications with a missing dump for ANY bucket number: every entry
    # of stat_dfs is used below, not only the first one
    if (len(stat_dfs) != len(bucket_numbers)) or any(df is None for df in stat_dfs):
        continue

    # One column per bucket number
    plot_df = pd.DataFrame()
    for (bucket_size, stat_df) in zip(bucket_numbers, stat_dfs):
        local_df = stat_df[['rank', 'empty_bins_perc_no_wildcard']].copy()

        # Position of each row within its rank (presumably the time step -- TODO confirm)
        local_df['group'] = local_df.groupby('rank').cumcount()
        local_df = local_df.groupby('group').agg({
            'empty_bins_perc_no_wildcard': 'min'
        }).reset_index(drop=True)

        # Occupied share = 1 - empty share (assumes the column is a 0..1 fraction -- TODO confirm)
        plot_df[str(bucket_size)] = 1.0 - local_df.copy()

    # Fraction -> percentage
    plot_df = plot_df * 100

    fig, ax = plt.subplots()
    sns.lineplot(plot_df, ax=ax)

    ax.set_title(f"Timeline of % occupied bins for {app_name}")
    ax.set_xlabel("Runtime")
    ax.set_ylabel("Occupied bins")
    ax.legend(title="Total bins")

    ax.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=None))
    ax.set_xticks([])

    fig.tight_layout()

In [None]:
# Timeline of max. collisions for each app
# Rows are numbered per rank and, for every row position, the max over all
# ranks is taken (change the aggregation to check other views)

for (app_name, stat_dfs) in zip(app_names, stats_dfs):
    # Skip applications with a missing dump for ANY bucket number: every entry
    # of stat_dfs is used below, not only the first one
    if (len(stat_dfs) != len(bucket_numbers)) or any(df is None for df in stat_dfs):
        continue

    # Top ylim for the bottom axis: peak of the second bucket number (16 bins)
    ylim_start = stat_dfs[1]['collisions_no_wildcard'].max()

    # Bottom ylim for the top axis: peak of the first bucket number (1 bin) minus mean ** 0.75.
    # More precise as to which plots get split in halves; shows fewer lines on the top half.
    # Alternative that shows the same number of lines on each half: max() - ylim_start
    ylim_end = stat_dfs[0]['collisions_no_wildcard'].max() - abs(pow(stat_dfs[0]['collisions_no_wildcard'].mean(), 0.75))

    dist = abs(ylim_start - ylim_end)

    # One column per bucket number
    plot_df = pd.DataFrame()
    for (bucket_size, stat_df) in zip(bucket_numbers, stat_dfs):
        local_df = stat_df[['rank', 'collisions_no_wildcard']].copy()

        # Position of each row within its rank (presumably the time step -- TODO confirm)
        local_df['group'] = local_df.groupby('rank').cumcount()
        local_df = local_df.groupby('group').agg({
            'collisions_no_wildcard': 'max'
        }).reset_index(drop=True)

        plot_df[str(bucket_size)] = local_df.copy()

    if dist < ylim_start:
        # Values are close enough: a single axis is readable
        fig, ax = plt.subplots()

        sns.lineplot(plot_df, ax=ax)

        ax.set_title(f"Timeline of max. collisions for {app_name}")
        ax.set_xlabel("Runtime")
        ax.set_ylabel("Max. Collisions", loc='bottom')
        ax.legend(title="Total bins")

        ax.yaxis.set_major_locator(plt.MaxNLocator(integer=True))

        ax.set_xticks([])
    else:
        # Broken y axis: top axis shows the high values, bottom axis the low ones
        fig, (ax_top, ax_bottom) = plt.subplots(nrows=2, sharex=True, gridspec_kw={'hspace':0.05})

        # Small diagonal marks on the left edge where the axis is cut
        d = .015
        kwargs = dict(transform=ax_top.transAxes, color='k', clip_on=False)
        ax_top.plot((-d, +d), (-d, +d), **kwargs)
        kwargs.update(transform=ax_bottom.transAxes)
        ax_bottom.plot((-d, +d), (1 - d, 1 + d), **kwargs)

        sns.lineplot(plot_df, ax=ax_top)
        sns.lineplot(plot_df, ax=ax_bottom)

        ax_top.set_ylim(bottom=ylim_end)
        ax_bottom.set_ylim(0, ylim_start + 1.0)

        # Keep a single legend (on the top axis)
        ax_bottom.legend_.remove()

        # Title and label match the single-axis branch (they used to say
        # "queue depth", copied over from the queue depth cell)
        ax_top.set_title(f"Timeline of max. collisions for {app_name}")
        ax_bottom.set_xlabel("Runtime")
        ax_top.set_ylabel("Max. Collisions", loc='bottom')
        ax_top.legend(title="Total bins")

        ax_top.yaxis.set_major_locator(plt.MaxNLocator(integer=True))
        ax_bottom.yaxis.set_major_locator(plt.MaxNLocator(integer=True))

        ax_bottom.set_xticks([])

In [None]:
# Timeline of max. queue depth ('collisions_no_wildcard_max_len') per rank, for each app
# One subplot per rank and one line per bucket number; nothing is aggregated across ranks

for (app_name, stat_dfs) in zip(app_names, stats_dfs):
    # Skip applications with a missing dump for ANY bucket number: every entry
    # of stat_dfs is used below, not only the first one
    if (len(stat_dfs) != len(bucket_numbers)) or any(df is None for df in stat_dfs):
        continue

    # Highest rank id + 1 (assumes rank ids start at 0 -- TODO confirm)
    max_rank = int(stat_dfs[0]['rank'].max()) + 1

    # Limit of ranks to plot. Applications with more than the limit are skipped (too much clutter)
    if max_rank > 8:
        continue

    # Four subplots per row, rounded up
    num_rows = (max_rank + 4 - 1) // 4

    # squeeze=False keeps a 2-D array of axes even when there is a single row
    fig, axes = plt.subplots(nrows=num_rows, ncols=4, sharex=True, sharey=True, figsize=(10,2), squeeze=False)

    for rank in range(max_rank):
        # One column per bucket number, restricted to the rows of this rank
        plot_df = pd.DataFrame()
        for (bucket_size, stat_df) in zip(bucket_numbers, stat_dfs):
            selection_df = stat_df.loc[stat_df['rank'] == rank]
            selection_df = selection_df['collisions_no_wildcard_max_len'].reset_index(drop=True)

            plot_df[str(bucket_size)] = selection_df.copy()

        # Row-major position, same cell that plt.subplot(num_rows, 4, rank + 1) selects
        selected = axes.flat[rank]
        selected.plot(plot_df)
        selected.yaxis.set_major_locator(plt.MaxNLocator(integer=True))
        selected.set_xticks([])

    fig.legend(bucket_numbers, title='Bucket Size')
    fig.text(0.5, 0.04, 'Runtime', ha='center', va='center')

In [None]:
# Summarize apps using the configured bins. Get max and average of the bins usage and plot them
# Plotted using 1, 32 and 128 bins

bins = [1, 32, 128]
# Position of each plotted bin count inside bucket_numbers (and thus inside every stat_dfs list)
bins_pos = [bucket_numbers.index(num_bins) for num_bins in bins]

# One (app, Num. Bins, max, avg) record per application and bin count; the
# frame is built once so the numeric columns get numeric dtypes
summary_records = []
avgs = list()
for (bin_pos, num_bins) in zip(bins_pos, bins):
    bin_avgs = list()
    max_avgs = list()
    for (app_name, stat_dfs) in zip(app_names, stats_dfs):
        # Skip applications with a missing dump for any of the plotted bin
        # counts, so every facet shows the same set of applications
        if (len(stat_dfs) != len(bucket_numbers)) or any(stat_dfs[pos] is None for pos in bins_pos):
            continue

        # Bucket selected
        stat_df = stat_dfs[bin_pos]

        local_df = stat_df[['rank', 'empty_bins_perc_no_wildcard']].copy()

        # Highest occupancy seen in any row (assumes the column is a 0..1 fraction -- TODO confirm)
        my_max = (1.0 - local_df['empty_bins_perc_no_wildcard'].min()) * 100

        # Average the empty share over ranks for each row position ...
        local_df['group'] = local_df.groupby('rank').cumcount()
        local_df = local_df.groupby('group').agg({
            'empty_bins_perc_no_wildcard': 'mean'
        }).reset_index(drop=True)

        # ... and keep the position with the highest averaged occupancy
        my_mean = (1.0 - local_df['empty_bins_perc_no_wildcard'].min()) * 100

        bin_avgs.append(my_mean)
        max_avgs.append(my_max)

        summary_records.append((app_name, num_bins, my_max, my_mean))

    # Mean over applications of the 'avg' and 'max' values (NaN when no application qualified)
    avgs.append((pd.Series(bin_avgs, dtype=float).mean(), pd.Series(max_avgs, dtype=float).mean()))

plot_df = pd.DataFrame(summary_records, columns=['app','Num. Bins', 'max', 'avg'])
plot_df = plot_df.sort_values(by=['Num. Bins', 'avg'], ascending=[True, False])

grid = sns.FacetGrid(plot_df, col='Num. Bins', sharey=True, sharex=False)
grid.map(sns.barplot, 'app', 'avg', label='Avg.')
grid.map(sns.scatterplot, 'app', 'max', label='Max.', color='orange')
grid.set_axis_labels('Application', 'Used bins')

grid.add_legend(title='Statistic')

# Loop names deliberately avoid `max`, which would overwrite the builtin for the rest of the notebook
for (ax, (mean_of_avgs, mean_of_maxes)) in zip(grid.axes.flat, avgs):
    ax.axhline(mean_of_avgs, ls='--', color='r')
    # ax.axhline(mean_of_maxes, ls='--', color='g')
    ax.tick_params(axis='x', rotation=90)
    ax.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=None))

# Ignore order warning, input is sorted beforehand

In [None]:
# Summarize apps using the configured bins. Get max and average of the queue depth and plot them
# Plotted using 1, 32 and 128 bins

bins = [1, 32, 128]
# Position of each plotted bin count inside bucket_numbers (and thus inside every stat_dfs list)
bins_pos = [bucket_numbers.index(num_bins) for num_bins in bins]

# One (app, Num. Bins, max, avg) record per application and bin count; the
# frame is built once so the numeric columns get numeric dtypes
summary_records = []
avgs = list()
for (bin_pos, num_bins) in zip(bins_pos, bins):
    bin_avgs = list()
    max_avgs = list()
    for (app_name, stat_dfs) in zip(app_names, stats_dfs):
        # Skip applications with a missing dump for any of the plotted bin
        # counts, so every facet shows the same set of applications
        if (len(stat_dfs) != len(bucket_numbers)) or any(stat_dfs[pos] is None for pos in bins_pos):
            continue

        # Bucket selected
        stat_df = stat_dfs[bin_pos]

        local_df = stat_df[['rank', 'collisions_no_wildcard_max_len']].copy()

        # Highest value seen in any row
        my_max = local_df['collisions_no_wildcard_max_len'].max()

        # Average over ranks for each row position ...
        local_df['group'] = local_df.groupby('rank').cumcount()
        local_df = local_df.groupby('group').agg({
            'collisions_no_wildcard_max_len': 'mean'
        }).reset_index(drop=True)

        # ... and keep the highest of those averages
        my_mean = local_df['collisions_no_wildcard_max_len'].max()

        bin_avgs.append(my_mean)
        max_avgs.append(my_max)

        summary_records.append((app_name, num_bins, my_max, my_mean))

    # Mean over applications of the 'avg' and 'max' values (NaN when no application qualified)
    avgs.append((pd.Series(bin_avgs, dtype=float).mean(), pd.Series(max_avgs, dtype=float).mean()))

plot_df = pd.DataFrame(summary_records, columns=['app','Num. Bins', 'max', 'avg'])
plot_df = plot_df.sort_values(by=['Num. Bins', 'avg'], ascending=[True, False])

grid = sns.FacetGrid(plot_df, col='Num. Bins', sharey=False, sharex=False)
grid.map(sns.barplot, 'app', 'avg', label='Avg.')
grid.map(sns.scatterplot, 'app', 'max', label='Max.', color='orange')
grid.set_axis_labels('Application', 'Queue depth [number of elements]')

grid.add_legend(title='Statistic')

# Loop names deliberately avoid `max`, which would overwrite the builtin for the rest of the notebook
for (ax, (mean_of_avgs, mean_of_maxes)) in zip(grid.axes.flat, avgs):
    ax.axhline(mean_of_avgs, ls='--', color='r')
    # ax.axhline(mean_of_maxes, ls='--', color='g')
    ax.tick_params(axis='x', rotation=90)

# Ignore order warning, input is sorted beforehand

plt.savefig("pictures/q-depth.svg")

In [None]:
# Summarize apps using the configured bins. Get max and average of the collisions and plot them
# Plotted using 1, 32 and 128 bins

bins = [1, 32, 128]
# Position of each plotted bin count inside bucket_numbers (and thus inside every stat_dfs list)
bins_pos = [bucket_numbers.index(num_bins) for num_bins in bins]

# One (app, Num. Bins, max, avg) record per application and bin count; the
# frame is built once so the numeric columns get numeric dtypes
summary_records = []
avgs = list()
for (bin_pos, num_bins) in zip(bins_pos, bins):
    bin_avgs = list()
    max_avgs = list()
    for (app_name, stat_dfs) in zip(app_names, stats_dfs):
        # Skip applications with a missing dump for any of the plotted bin
        # counts, so every facet shows the same set of applications
        if (len(stat_dfs) != len(bucket_numbers)) or any(stat_dfs[pos] is None for pos in bins_pos):
            continue

        # Bucket selected
        stat_df = stat_dfs[bin_pos]

        local_df = stat_df[['rank', 'collisions_no_wildcard']].copy()

        # Highest value seen in any row
        my_max = local_df['collisions_no_wildcard'].max()

        # Average over ranks for each row position ...
        local_df['group'] = local_df.groupby('rank').cumcount()
        local_df = local_df.groupby('group').agg({
            'collisions_no_wildcard': 'mean'
        }).reset_index(drop=True)

        # ... and keep the highest of those averages
        my_mean = local_df['collisions_no_wildcard'].max()

        bin_avgs.append(my_mean)
        max_avgs.append(my_max)

        summary_records.append((app_name, num_bins, my_max, my_mean))

    # Mean over applications of the 'avg' and 'max' values (NaN when no application qualified)
    avgs.append((pd.Series(bin_avgs, dtype=float).mean(), pd.Series(max_avgs, dtype=float).mean()))

plot_df = pd.DataFrame(summary_records, columns=['app','Num. Bins', 'max', 'avg'])
plot_df = plot_df.sort_values(by=['Num. Bins', 'avg'], ascending=[True, False])

grid = sns.FacetGrid(plot_df, col='Num. Bins', sharey=False, sharex=False)
grid.map(sns.barplot, 'app', 'avg', label='Avg.')
grid.map(sns.scatterplot, 'app', 'max', label='Max.', color='orange')
grid.set_axis_labels('Application', 'Number of collisions')

grid.add_legend(title='Statistic')

# Loop names deliberately avoid `max`, which would overwrite the builtin for the rest of the notebook
for (ax, (mean_of_avgs, mean_of_maxes)) in zip(grid.axes.flat, avgs):
    ax.axhline(mean_of_avgs, ls='--', color='r')
    # ax.axhline(mean_of_maxes, ls='--', color='g')
    ax.tick_params(axis='x', rotation=90)

# Ignore order warning, input is sorted beforehand

In [None]:
# Diminishing returns when increasing the number of bins
# NOTE: the resulting plot does not (yet) show what was intended

# One (bin, avg) record per bucket number
returns_records = []
for (bin_idx, num_bins) in enumerate(bucket_numbers):
    bin_avg_list = list()
    for (app_name, stat_dfs) in zip(app_names, stats_dfs):
        # Skip applications with a missing dump for any bucket number, so every
        # bucket is averaged over the same applications. This must be `continue`:
        # `break` would also drop every application listed after the first empty one
        if (len(stat_dfs) != len(bucket_numbers)) or any(df is None for df in stat_dfs):
            continue

        local_df = stat_dfs[bin_idx][['rank', 'collisions_no_wildcard_max_len']].copy()

        # Average over ranks for each row position ...
        local_df['group'] = local_df.groupby('rank').cumcount()
        local_df = local_df.groupby('group').agg({
            'collisions_no_wildcard_max_len': 'mean'
        }).reset_index(drop=True)

        # ... and keep the highest of those averages
        my_mean = local_df['collisions_no_wildcard_max_len'].max()
        bin_avg_list.append(my_mean)

    # Mean over applications (NaN when no application qualified)
    returns_records.append((str(num_bins), pd.Series(bin_avg_list, dtype=float).mean()))

plot_df = pd.DataFrame(returns_records, columns=['bin', 'avg'])

# Everything is expressed relative to the first bucket number
baseline_avg = plot_df['avg'].iloc[0]

plot_df['improvement'] = baseline_avg / plot_df['avg']
plot_df['perc'] = (baseline_avg - plot_df['avg']) / baseline_avg

print(baseline_avg)

# plot_df = plot_df[:-1]

# plot_df['improvement'] = plot_df['avg'].div(plot_df['avg'].shift(1))
# plot_df = plot_df[1:]

ax = sns.lineplot(plot_df, x='bin', y='perc')
# ax.set_ylim([0.0, 30.0])

plot_df