In [None]:
%load_ext autoreload
%autoreload 2
import dt4dds_benchmark
import plotly.express as px
import pandas as pd
import numpy as np

# Benchmark runs to load. Each entry names one '<run>.hdf5' file that is read
# from both the './data/dropout' and the './data/rate' directory.
CODEC_RUNS = (
    'aeon_high',
    'aeon_low',
    'aeon_medium',
    'fountain_high',
    'fountain_low',
    'fountain_medium',
    'goldman_default',
    'rs_high',
    'rs_low',
    'rs_medium',
    'hedges_low',
    'hedges_medium',
    'yinyang_default',
)


def _load_experiment(experiment, runs=CODEC_RUNS):
    """Load every run of one experiment and combine them into a single dataset.

    Parameters
    ----------
    experiment : str
        Name of the sub-directory of './data' that holds the HDF5 files.
    runs : iterable of str
        File stems (without '.hdf5') to read, in the order they are combined.

    Returns
    -------
    Whatever `dt4dds_benchmark.analysis.Dataset.combine` returns for the
    per-file results of `HDF5Manager(...).get_data()`.
    """
    return dt4dds_benchmark.analysis.Dataset.combine(*[
        dt4dds_benchmark.pipelines.HDF5Manager(f'./data/{experiment}/{run}.hdf5').get_data()
        for run in runs
    ])


data_dropout = _load_experiment('dropout')
data_rate = _load_experiment('rate')

In [None]:
# Base colour of every codec family (keyed by 'codec.type').
colormap_type = {
    'DNAAeon': '#3182bd',
    'DNAFountain': '#31a354',
    'DNARS': '#e6550d',
    'HEDGES': '#756bb1',
    'YinYang': '#636363',
    'Goldman': '#de2d26',
}

# Colour of every individual codec configuration, keyed by '<codec.type>-<level>'.
# Within a family the shade gets lighter from High over Medium to Low.
# NOTE(review): the Pareto-front table relabels 'default' as 'Low', which would
# produce 'YinYang-Low' rather than the 'YinYang-High' key below -- confirm
# which label is intended before using this map with that table.
colormap_name = {
    # DNAAeon
    'DNAAeon-High': '#08519c',
    'DNAAeon-Medium': '#3182bd',
    'DNAAeon-Low': '#6baed6',
    # DNAFountain
    'DNAFountain-High': '#006d2c',
    'DNAFountain-Medium': '#31a354',
    'DNAFountain-Low': '#74c476',
    # DNARS
    'DNARS-High': '#a63603',
    'DNARS-Medium': '#e6550d',
    'DNARS-Low': '#fd8d3c',
    # HEDGES
    'HEDGES-Medium': '#756bb1',
    'HEDGES-Low': '#9e9ac8',
    # codecs with a single configuration
    'YinYang-High': '#636363',
    'Goldman-Low': '#de2d26',
}

In [None]:
# Columns that identify one codec/workflow combination in both fit tables.
_fit_keys = ['codec.type', 'codec.name', 'workflow.name', 'workflow.type']

# Fits of data_dropout: grouped additionally by 'workflow.dropout' and fitted
# along 'workflow.overall_rate'. The 'threshold' column of the result is then
# stored as the group's 'workflow.overall_rate'.
# NOTE(review): presumably 'threshold' is the fitted limit of the swept
# variable -- confirm against get_fits_by_group.
df_dropout = data_dropout.get_fits_by_group(
    _fit_keys + ['workflow.dropout'],
    'workflow.overall_rate',
    additional_agg={'code_rate': 'mean'},
)
df_dropout['workflow.overall_rate'] = df_dropout['threshold']

# Fits of data_rate: the roles of the two variables are swapped.
df_rate = data_rate.get_fits_by_group(
    _fit_keys + ['workflow.overall_rate'],
    'workflow.dropout',
    additional_agg={'code_rate': 'mean'},
)
df_rate['workflow.dropout'] = df_rate['threshold']

# Both tables now carry 'workflow.overall_rate' and 'workflow.dropout' and are
# stacked into one frame with a fresh 0..n-1 index.
df = pd.concat([df_dropout, df_rate], ignore_index=True)

In [None]:
# see https://stackoverflow.com/questions/32791911/fast-calculation-of-pareto-front-in-python
def is_pareto_optimal(costs):
    """Return a boolean mask marking the Pareto-optimal rows of `costs`.

    Parameters
    ----------
    costs : numpy.ndarray
        Two-dimensional array with one row per point and one column per cost.
        All costs are minimised.

    Returns
    -------
    numpy.ndarray
        Boolean array with one entry per row. Rows containing NaN are never
        marked. Of several identical rows only the first one stays marked.
    """
    # rows with a missing cost can neither be optimal nor dominate others
    keep = ~np.isnan(costs).any(axis=1)
    for idx in range(len(costs)):
        if not keep[idx]:
            continue
        # among the remaining candidates, only those that beat the current
        # point in at least one cost survive
        candidates = np.flatnonzero(keep)
        keep[candidates] = (costs[candidates] < costs[idx]).any(axis=1)
        # the current point never beats itself, so re-add it explicitly
        keep[idx] = True
    return keep
    
# function that receives a groupby subset and only returns the pareto front
def apply_pareto(group, cost1, cost2, cost1_max = False, cost2_max = False):
    """Reduce a groupby subset to the rows on its two-objective Pareto front.

    Parameters
    ----------
    group : pandas.DataFrame
        Subset handed over by `groupby(...).apply`.
    cost1, cost2 : str
        Names of the two columns that span the front.
    cost1_max, cost2_max : bool
        Set to True when the respective column is to be maximised instead of
        minimised.

    Returns
    -------
    pandas.DataFrame
        The rows of `group` that `is_pareto_optimal` marks as optimal.
    """
    # astype() returns a fresh array, so flipping signs leaves `group` untouched
    cost_matrix = group[[cost1, cost2]].values.astype(float)
    # maximising a column is the same as minimising its negative
    for column, maximise in enumerate((cost1_max, cost2_max)):
        if maximise:
            np.negative(cost_matrix[:, column], out=cost_matrix[:, column])
    on_front = is_pareto_optimal(cost_matrix)
    return group[on_front]

# function that receives a groupby subset and returns the complete pareto front by adding extreme points if not present already
def complete_pareto_front(group, min_rate=0.0001, min_dropout=0.001, pad_factor=1.001):
    """Extend a Pareto front so that it reaches both extremes.

    If the front does not reach down to `min_rate` in 'workflow.overall_rate',
    a point at `min_rate` is appended whose dropout is `pad_factor` times the
    largest dropout of the front. Likewise, if the front does not reach down
    to `min_dropout` in 'workflow.dropout', a point at `min_dropout` is
    appended whose rate is `pad_factor` times the largest rate.

    Parameters
    ----------
    group : pandas.DataFrame
        Groupby subset with the columns 'codec.type', 'codec.name',
        'workflow.type', 'workflow.overall_rate' and 'workflow.dropout'.
        It is not modified.
    min_rate : float
        Smallest error rate the completed front has to reach.
    min_dropout : float
        Smallest dropout the completed front has to reach.
    pad_factor : float
        Factor applied to the current maximum of the other variable when an
        extreme point is added.

    Returns
    -------
    pandas.DataFrame
        Copy of `group` with up to two rows appended and a fresh 0..n-1
        index. Columns missing from the appended rows are filled with NaN by
        the concatenation.
    """
    full = group.copy()

    def _extreme_point(rate, dropout):
        # One-row frame built from a list of records so that the numeric
        # columns are inferred as floats. Building it via
        # from_dict(orient='index').T yields object columns, which would turn
        # the numeric columns of the result into object dtype on concat.
        return pd.DataFrame([{
            'codec.type': full['codec.type'].iloc[0],
            'codec.name': full['codec.name'].iloc[0],
            'workflow.type': full['workflow.type'].iloc[0],
            'workflow.overall_rate': rate,
            'workflow.dropout': dropout,
        }])

    # An empty group has NaN minima, so both comparisons are False and no
    # labels are looked up.
    if full['workflow.overall_rate'].min() > min_rate:
        add = _extreme_point(min_rate, pad_factor*full['workflow.dropout'].max())
        full = pd.concat([full, add], ignore_index=True)
    if full['workflow.dropout'].min() > min_dropout:
        add = _extreme_point(pad_factor*full['workflow.overall_rate'].max(), min_dropout)
        full = pd.concat([full, add], ignore_index=True)
    return full.reset_index(drop=True)

In [None]:
# Keep only the Pareto-optimal fits of every codec/workflow-type combination.
# Both the tolerated error rate and the tolerated dropout are maximised.
# reset_index() turns the group keys of the result index back into columns.
idf = (
    df
    .groupby(['codec.type', 'codec.name', 'workflow.type'])
    .apply(
        apply_pareto,
        'workflow.overall_rate',
        'workflow.dropout',
        cost1_max=True,
        cost2_max=True,
        include_groups=False,
    )
    .reset_index()
)

In [None]:
_pareto_keys = ['codec.type', 'codec.name', 'workflow.type']
_pareto_axes = ['workflow.overall_rate', 'workflow.dropout']

# Complete the front of every group with its extreme points, then order the
# rows so that each front is drawn along increasing error rate.
plotdf = (
    idf
    .groupby(_pareto_keys)[_pareto_keys + _pareto_axes]
    .apply(complete_pareto_front)
    .reset_index(drop=True)
    .sort_values(['codec.type', 'workflow.overall_rate', 'workflow.dropout'])
)

# Human-readable configuration labels; 'default' is shown together with 'Low'.
# NOTE(review): this puts YinYang ('default') into the 'Low' group, whereas the
# runtime table assigns YinYang to '3high' -- confirm which grouping is intended.
_level_labels = {'high': 'High', 'medium': 'Medium', 'low': 'Low', 'default': 'Low'}
plotdf['codec.name'] = plotdf['codec.name'].replace(_level_labels)
plotdf['name'] = plotdf['codec.type'] + '-' + plotdf['codec.name']
plotdf

In [None]:
# Pareto fronts: one panel per configuration level, one line per codec family.
# The x axis stays linear, only the dropout axis is logarithmic.
fig = px.line(
    plotdf,
    x='workflow.overall_rate',
    y='workflow.dropout',
    color='codec.type',
    color_discrete_map=colormap_type,
    facet_col='codec.name',
    facet_col_spacing=0.08,
    category_orders={'codec.name': ['High', 'Medium', 'Low']},
    markers=True,
    log_y=True,
    range_x=[0.0, 0.15],
    range_y=[0.005, 1],
)
fig.update_layout(
    width=340,
    height=140,
    showlegend=False,
    margin={'l': 0, 'r': 10, 't': 20, 'b': 0},
)

# Vertical reference lines in every panel of the first row.
# NOTE(review): the meaning of the error rates 0.02 and 0.0065 is not
# documented here -- consider naming them.
for x_position, dash_style in ((0.02, 'dot'), (0.0065, 'dash')):
    fig.add_vline(x=x_position, line_dash=dash_style, line_width=1, row=1)

# tick spacing first, then titles and percent formatting on the outer axes
fig.update_xaxes(dtick=0.05)
fig.update_yaxes(dtick=1)
fig.update_xaxes(title='Error rate per nt', tickformat=",.0%", row=1)
fig.update_yaxes(title='Sequence dropout', tickformat=",.0%", col=1)
# facet titles read 'codec.name=<level>'; keep only the part after the '='
fig.for_each_annotation(lambda a: a.update(text=a.text.rsplit("=", 1)[-1]))

fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
for target, options in (
    ('./figures/pareto_front.svg', {}),
    ('./figures/pareto_front.png', {'scale': 2}),
):
    fig.write_image(target, **options)
fig.show()

In [None]:
def _successful_decoding_rows(dataset):
    """Return the performance rows of successfully decoded runs.

    Merges `dataset.combined_performances` with `dataset.results` on their
    common columns, then drops every row whose 'decoding_success' equals
    False and every row whose 'identifier' is not b'decoding'.
    """
    merged = pd.merge(dataset.combined_performances, dataset.results)
    # element-wise comparison against False is intended here (not `is False`)
    merged = merged.drop(merged.loc[merged['decoding_success'] == False].index)
    return merged.drop(merged.loc[merged['identifier'] != b'decoding'].index)


performancedf_dropout = _successful_decoding_rows(data_dropout)
performancedf_rate = _successful_decoding_rows(data_rate)

# reset_index() without drop=True keeps the fresh 0..n-1 index as an extra
# 'index' column.
performancedf = pd.concat([performancedf_dropout, performancedf_rate], ignore_index=True).reset_index()

# Prefix the configuration levels with a digit so that they sort as
# low < medium < high; 'default' is treated as low.
performancedf['codec.name'] = performancedf['codec.name'].map({
    'default': '1low',
    'low': '1low',
    'medium': '2medium',
    'high': '3high',
})
# YinYang only has a 'default' configuration; it is assigned to the high group.
performancedf.loc[performancedf['codec.type'] == 'YinYang', 'codec.name'] = '3high'
performancedf = performancedf.sort_values(['codec.type', 'codec.name', 'duration'])
# NOTE(review): presumably converts seconds to minutes -- confirm the unit of 'duration'.
performancedf['duration'] = performancedf['duration']/60

performancedf

In [None]:
# Decoding duration of every successful run: one row of panels per codec
# family, one column per configuration level, colour = duration.
fig = px.scatter(
    performancedf,
    x='workflow.overall_rate',
    y='workflow.dropout',
    color='duration',
    color_continuous_scale='Inferno',
    range_color=(0, 60),
    facet_row='codec.type',
    facet_row_spacing=0.03,
    facet_col='codec.name',
    facet_col_spacing=0.05,
    log_x=True,
    log_y=True,
)

# on logarithmic axes the range is given in powers of ten, i.e. 1e-3 to 1
fig.for_each_xaxis(lambda xaxis: xaxis.update(range=[-3, 0], tickangle=0))
fig.for_each_yaxis(lambda yaxis: yaxis.update(range=[-3, 0]))
fig.update_xaxes(title='Error rate per nt', row=1)
fig.update_xaxes(showticklabels=True, row=2)
fig.update_yaxes(title='Sequence dropout', col=1)
# facet titles read '<column>=<value>'; keep only the part after the '='
fig.for_each_annotation(lambda a: a.update(text=a.text.rsplit("=", 1)[-1]))
fig.update_layout(
    width=680,
    height=800,
    showlegend=False,
    margin={'l': 0, 'r': 10, 't': 1, 'b': 10},
)
# NOTE(review): plotly documents colorbar.y as limited to [-2, 3] in paper
# coordinates, so 10.0 looks out of range -- confirm the intended position.
fig.update_coloraxes(colorbar=dict(orientation='h', thickness=20, y=10.0))
fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
fig.show()

fig.write_image('./figures/runtime.svg')
fig.write_image('./figures/runtime.png', scale=2)