In [None]:
%load_ext autoreload
%autoreload 2
import dt4dds_benchmark
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Codec/parameter-set runs evaluated in every scenario; each entry names one
# HDF5 file inside the scenario's data directory.
CODEC_RUNS = (
    'aeon_high',
    'aeon_low',
    'aeon_medium',
    'fountain_high',
    'fountain_low',
    'fountain_medium',
    'goldman_default',
    'rs_high',
    'rs_low',
    'rs_medium',
    'hedges_low',
    'hedges_medium',
    'yinyang_default',
)


def load_scenario(scenario, runs=CODEC_RUNS):
    """Read ``./data/<scenario>/<run>.hdf5`` for every run and combine the results.

    Parameters
    ----------
    scenario : str
        Name of the sub-directory of ``./data`` holding the HDF5 files.
    runs : iterable of str
        File stems to load, in the order they are passed to ``Dataset.combine``.

    Returns
    -------
    The object returned by ``dt4dds_benchmark.analysis.Dataset.combine``.
    """
    per_run_data = [
        dt4dds_benchmark.pipelines.HDF5Manager(f'./data/{scenario}/{run}.hdf5').get_data()
        for run in runs
    ]
    return dt4dds_benchmark.analysis.Dataset.combine(*per_run_data)


data_initcov = load_scenario('initcov')
data_seqdepth = load_scenario('seqdepth')

In [None]:
# one base color per codec family
colormap_type = {
    'DNAAeon': '#3182bd',
    'DNAFountain': '#31a354',
    'DNARS': '#e6550d',
    'HEDGES': '#756bb1',
    'YinYang': '#636363',
    'Goldman': '#de2d26',
}

# one color per codec family and level: where a family has several levels,
# "Medium" reuses the family's base color, "High" is a darker and "Low" a
# lighter shade of it; families with a single level use the base color
colormap_name = {
    # DNAAeon
    'DNAAeon-High': '#08519c',
    'DNAAeon-Medium': '#3182bd',
    'DNAAeon-Low': '#6baed6',
    # DNAFountain
    'DNAFountain-High': '#006d2c',
    'DNAFountain-Medium': '#31a354',
    'DNAFountain-Low': '#74c476',
    # DNARS
    'DNARS-High': '#a63603',
    'DNARS-Medium': '#e6550d',
    'DNARS-Low': '#fd8d3c',
    # HEDGES (no "High" entry)
    'HEDGES-Medium': '#756bb1',
    'HEDGES-Low': '#9e9ac8',
    # single-level codecs
    'YinYang-High': '#636363',
    'Goldman-Low': '#de2d26',
}

### Define functions to select Pareto-optimal points

In [None]:
# see https://stackoverflow.com/questions/32791911/fast-calculation-of-pareto-front-in-python
def is_pareto_optimal(costs):
    """Flag the rows of ``costs`` that lie on the Pareto front (all costs are minimized).

    Parameters
    ----------
    costs : numpy.ndarray
        2-D array with one row per point and one column per cost.

    Returns
    -------
    numpy.ndarray
        Boolean mask with one entry per row, True for Pareto-optimal rows.
        Rows containing NaN are never flagged. Of several identical optimal
        rows only the first one is flagged.
    """
    # rows with a missing cost are excluded up front and never compared
    candidate = ~np.isnan(costs).any(axis=1)
    for idx in range(len(costs)):
        if not candidate[idx]:
            continue
        survivors = np.flatnonzero(candidate)
        # a surviving row is kept only if it is strictly cheaper than the
        # current row in at least one cost
        candidate[survivors] = (costs[survivors] < costs[idx]).any(axis=1)
        # the current row is never strictly cheaper than itself: re-admit it
        candidate[idx] = True
    return candidate
    
# function that receives a groupby subset and only returns the pareto front
def apply_pareto(group, cost1, cost2, cost1_max = False, cost2_max = False):
    """Reduce ``group`` to the rows that are Pareto-optimal in two columns.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to filter, e.g. one subset of a groupby.
    cost1, cost2 : str
        Names of the two columns to optimize.
    cost1_max, cost2_max : bool
        Set to True if the respective column is to be maximized instead of
        minimized.

    Returns
    -------
    pandas.DataFrame
        The rows of ``group`` flagged by ``is_pareto_optimal``.
    """
    # astype returns a fresh array, so flipping signs below leaves `group` untouched
    objectives = group[[cost1, cost2]].values.astype(float)
    # maximizing a column is the same as minimizing its negative
    for column, maximize in enumerate((cost1_max, cost2_max)):
        if maximize:
            objectives[:, column] = -objectives[:, column]
    on_front = is_pareto_optimal(objectives)
    return group[on_front]

# function that receives a groupby subset and returns the complete pareto front by adding extreme points if not present already
def complete_pareto_front(group, limit=1000, shrink=0.999):
    """Extend a Pareto front so that it reaches ``limit`` on both axes.

    If no point of the front has an initial coverage of at least ``limit``, a
    point at ``(limit, shrink * min(sequencing depth))`` is appended. Likewise,
    if no point has a sequencing depth of at least ``limit``, a point at
    ``(shrink * min(initial coverage), limit)`` is appended.

    Parameters
    ----------
    group : pandas.DataFrame
        One front with the columns 'codec.type', 'codec.name', 'workflow.type',
        'workflow.initial_coverage' and 'workflow.sequencing_depth'. The
        identifier columns of appended points are copied from the first row.
    limit : float
        Value each axis is extended to (default 1000).
    shrink : float
        Factor applied to the minimum of the other axis for an appended point
        (default 0.999), placing it marginally below the existing front.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group`` with up to two appended rows and a fresh integer
        index. ``group`` itself is not modified.
    """
    coverage_col = 'workflow.initial_coverage'
    depth_col = 'workflow.sequencing_depth'

    def _append_point(front, coverage, depth):
        # Build the row from a list of records so that every column keeps its
        # own dtype. Building it from a transposed single-column frame makes
        # every column object-typed, which turns the numeric columns of the
        # concatenated result into object dtype as well.
        point = pd.DataFrame([{
            'codec.type': front['codec.type'].iloc[0],
            'codec.name': front['codec.name'].iloc[0],
            'workflow.type': front['workflow.type'].iloc[0],
            coverage_col: float(coverage),
            depth_col: float(depth),
        }])
        return pd.concat([front, point], ignore_index=True)

    full = group.copy()
    # an empty front has NaN maxima, so both comparisons are False and nothing is added
    if full[coverage_col].max() < limit:
        full = _append_point(full, limit, shrink * full[depth_col].min())
    if full[depth_col].max() < limit:
        full = _append_point(full, shrink * full[coverage_col].min(), limit)
    return full.reset_index(drop=True)

### Get the fits for both scenarios and harmonize the dataframes

In [None]:
# columns identifying one codec/workflow combination; each scenario appends
# its own fixed parameter as a further grouping column
group_keys = ['codec.type', 'codec.name', 'workflow.name', 'workflow.type']
# additional columns that are averaged per group alongside the fit
fit_agg = {'code_rate': 'mean', 'n_sequences': 'mean', 'sequence_length': 'mean', 'n_bases': 'mean', 'filesize_bit': 'mean'}

# initial-coverage scenario: grouped by initial coverage, fitted over sequencing depth;
# the fit's 'threshold' column is stored as the sequencing depth
# (presumably the fitted depth required for decoding — TODO confirm against get_fits_by_group)
df_initcov = data_initcov.get_fits_by_group(
    group_keys + ['workflow.initial_coverage'],
    'workflow.sequencing_depth',
    additional_agg=dict(fit_agg),
)
df_initcov['workflow.sequencing_depth'] = df_initcov['threshold']

# sequencing-depth scenario: the mirrored case, 'threshold' is stored as the initial coverage
df_seqdepth = data_seqdepth.get_fits_by_group(
    group_keys + ['workflow.sequencing_depth'],
    'workflow.initial_coverage',
    additional_agg=dict(fit_agg),
)
df_seqdepth['workflow.initial_coverage'] = df_seqdepth['threshold']

# stack both scenarios into one frame with float-typed coverage/depth columns
df = pd.concat([df_initcov, df_seqdepth], ignore_index=True)
for axis_col in ('workflow.sequencing_depth', 'workflow.initial_coverage'):
    df[axis_col] = df[axis_col].astype(float)

code_rate = df['code_rate'].astype(float)
coverage = df['workflow.initial_coverage'].astype(float)
df['eff_code_rate'] = code_rate / coverage
# NOTE(review): the factor 122.2 is unexplained here, and the storage-density
# cell further down uses 113.7 instead — confirm which conversion is intended
df['eff_storage_density'] = 122.2 * code_rate / coverage
df['name'] = df['codec.type'] + '-' + df['codec.name']

### Apply the Pareto filter to remove non-optimal points for initial coverage and sequencing depth

In [None]:
# Keep, per codec and workflow, only the points that are Pareto-optimal when
# both initial coverage and sequencing depth are minimized.
# include_groups=False keeps the grouping columns out of the frames handed to
# apply_pareto; reset_index() brings them back as regular columns afterwards.
pareto_groups = df.groupby(['codec.type', 'codec.name', 'workflow.type'])
idf = pareto_groups.apply(
    apply_pareto,
    'workflow.initial_coverage',
    'workflow.sequencing_depth',
    cost1_max=False,
    cost2_max=False,
    include_groups=False,
).reset_index()

In [None]:
# extend every front to the axis limits and order the points for line plotting
front_columns = ['codec.type', 'codec.name', 'workflow.type', 'workflow.initial_coverage', 'workflow.sequencing_depth']
plotdf = (
    idf.groupby(['codec.type', 'codec.name', 'workflow.type'])[front_columns]
    .apply(complete_pareto_front)
    .reset_index(drop=True)
    .sort_values(['codec.type', 'workflow.sequencing_depth', 'workflow.initial_coverage'])
)

# capitalize the level names used as facet labels
plotdf['codec.name'] = plotdf['codec.name'].replace({'high': 'High', 'medium': 'Medium', 'low': 'Low'})
# assign the codecs to a fixed facet column: YinYang to 'High', Goldman to 'Low'
for codec, level in (('YinYang', 'High'), ('Goldman', 'Low')):
    plotdf.loc[plotdf['codec.type'] == codec, 'codec.name'] = level
plotdf['name'] = plotdf['codec.type'] + '-' + plotdf['codec.name']
plotdf

In [None]:
# one panel per workflow type (rows) and level (columns), one line per codec type
front_plot_options = dict(
    x='workflow.initial_coverage',
    y='workflow.sequencing_depth',
    log_x=True,
    log_y=True,
    color='codec.type',
    color_discrete_map=colormap_type,
    markers=True,
    facet_row='workflow.type',
    facet_row_spacing=0.09,
    facet_col='codec.name',
    facet_col_spacing=0.06,
    category_orders={'codec.name': ['High', 'Medium', 'Low'], 'workflow.type': ['BestCase', 'WorstCase']},
    range_x=[0.3, 500],
    range_y=[0.3, 500],
)
fig = px.line(plotdf, **front_plot_options)
fig.update_layout(showlegend=False, width=320, height=240, margin=dict(l=0, r=10, t=20, b=0))

# dashed reference line at a sequencing depth of 30
fig.add_hline(y=30, line_dash='dash', line_width=1)

# on logarithmic axes dtick=1 places one tick per decade
fig.update_xaxes(dtick=1)
fig.update_yaxes(dtick=1)
# axis titles only on the first row / first column of panels
fig.update_xaxes(title='Initial coverage', row=1)
fig.update_yaxes(title='Sequencing depth', col=1)
# facet labels read "column=value"; keep only the value
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
fig.write_image('./figures/pareto_front_cov_by_rate.svg')
fig.write_image('./figures/pareto_front_cov_by_rate.png', scale=2)
fig.show()

### Get the storage density at a sequencing depth of 30x

In [None]:
# for each workflow.type, codec.type and codec.name, interpolate initial coverage vs. sequencing depth
storage_densities = []
for workflow_type in idf['workflow.type'].unique():
    in_workflow = idf['workflow.type'] == workflow_type
    for codec_type in idf['codec.type'].unique():
        in_codec = in_workflow & (idf['codec.type'] == codec_type)
        for codec_name in idf['codec.name'].unique():
            # np.interp expects increasing x values, hence the sort by sequencing depth
            subset = idf[in_codec & (idf['codec.name'] == codec_name)].sort_values('workflow.sequencing_depth', ascending=True)
            # combinations with fewer than two points are skipped
            if subset.shape[0] <= 1:
                continue
            depth = subset['workflow.sequencing_depth'].astype(float)
            coverage = subset['workflow.initial_coverage'].astype(float)
            # initial coverage on the front at a sequencing depth of 30; np.interp
            # returns the end-point value if 30 lies outside the covered depths
            red = np.interp([30.0], depth, coverage)[0]
            mean_code_rate = subset['code_rate'].mean()
            storage_densities.append({
                'workflow.type': workflow_type,
                'codec.type': codec_type,
                'codec.name': codec_name,
                'code_rate': mean_code_rate,
                'physical_redundancy': red,
                # NOTE(review): 113.7 is an unexplained conversion factor — confirm its origin
                'storage_density': 113.7*mean_code_rate/red,
            })

storage_densities = pd.DataFrame(storage_densities)
# the code rate becomes a two-decimal string from here on
storage_densities['code_rate'] = storage_densities['code_rate'].map('{:,.2f}'.format)
storage_densities

In [None]:
# order by codec and by the (string-formatted) code rate, then abbreviate the long codec names
plot_df = storage_densities.sort_values(['codec.type', 'code_rate'])
plot_df['codec.type'] = plot_df['codec.type'].replace({'YinYang': 'YY', 'Goldman': 'GM'})

workflow_colors = {'BestCase': '#3182bd', 'WorstCase': '#de2d26'}
fig = dt4dds_benchmark.analysis.plotting.tiered_bar(
    plot_df,
    "codec.type",
    "code_rate",
    "storage_density",
    color_by="workflow.type",
    color_discrete_map=workflow_colors,
)
fig.update_layout(width=350, height=120, margin=dict(l=0, r=2, t=10, b=30), showlegend=False)
fig.update_yaxes(title='Storage density / EB g<sup>-1</sup>', range=[0, 130])
fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
# applied after standardize_plot, so these tick settings are the ones in effect
fig.update_xaxes(tickfont_size=28/3, tickangle=0)
fig.show()
fig.write_image('./figures/comp.svg')
fig.write_image('./figures/comp.png', scale=2)

### Assess performance

In [None]:
def _successful_decoding_runs(dataset):
    """Merge performances with results and keep successful 'decoding' entries.

    Rows whose 'decoding_success' equals False and rows whose 'identifier' is
    not b'decoding' are dropped from the merged frame.
    """
    merged = pd.merge(dataset.combined_performances, dataset.results)
    merged = merged.drop(merged.index[merged['decoding_success'] == False])
    merged = merged.drop(merged.index[merged['identifier'] != b'decoding'])
    return merged


performancedf_dropout = _successful_decoding_runs(data_initcov)
performancedf_rate = _successful_decoding_runs(data_seqdepth)

performancedf = pd.concat([performancedf_dropout, performancedf_rate], ignore_index=True)
# the numeric prefixes make the level names sort as low < medium < high;
# names missing from the mapping become NaN
level_labels = {
    'default': '1low',
    'low': '1low',
    'medium': '2medium',
    'high': '3high',
}
performancedf['codec.name'] = performancedf['codec.name'].map(level_labels)
# YinYang is placed in the '3high' column (its 'default' run would map to '1low' otherwise)
performancedf.loc[performancedf['codec.type'] == 'YinYang', 'codec.name'] = '3high'
performancedf = performancedf.sort_values(['codec.type', 'codec.name', 'duration'])
# presumably converts seconds to minutes — TODO confirm the unit of 'duration'
performancedf['duration'] = performancedf['duration']/60

performancedf

In [None]:
# decoding duration of every successful best-case run, one panel per codec type (rows) and level (columns)
best_case_runs = performancedf.loc[performancedf['workflow.type'] == 'BestCase']
fig = px.scatter(
    best_case_runs,
    x='workflow.initial_coverage',
    y='workflow.sequencing_depth',
    log_x=True,
    log_y=True,
    color='duration',
    range_color=(0, 60),
    color_continuous_scale='Inferno',
    facet_row='codec.type',
    facet_row_spacing=0.03,
    facet_col='codec.name',
    facet_col_spacing=0.05,
)

# the axes are logarithmic, so the range is given in log10 units (10**-0.5 to 10**3.1)
fig.update_xaxes(range=[-0.5, 3.1], tickangle=0)
fig.update_yaxes(range=[-0.5, 3.1])
fig.update_xaxes(title='Physical coverage', row=1)
fig.update_xaxes(showticklabels=True, row=2)
fig.update_yaxes(title='Sequencing depth', col=1)
# facet labels read "column=value"; keep only the value
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(width=680, height=800, margin=dict(l=0, r=10, t=1, b=10), showlegend=False)
# NOTE(review): y=10.0 is outside the -2..3 range plotly documents for a
# paper-referenced colorbar position — confirm the intended placement
fig.update_coloraxes(colorbar=dict(orientation='h', thickness=20, y=10.0))
fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
fig.show()

fig.write_image('./figures/runtime_best.svg')
fig.write_image('./figures/runtime_best.png', scale=2)

In [None]:
# decoding duration of every successful worst-case run, one panel per codec type (rows) and level (columns)
worst_case_runs = performancedf.loc[performancedf['workflow.type'] == 'WorstCase']
fig = px.scatter(
    worst_case_runs,
    x='workflow.initial_coverage',
    y='workflow.sequencing_depth',
    log_x=True,
    log_y=True,
    color='duration',
    range_color=(0, 60),
    color_continuous_scale='Inferno',
    facet_row='codec.type',
    facet_row_spacing=0.08,
    facet_col='codec.name',
    facet_col_spacing=0.05,
)

# the axes are logarithmic, so the range is given in log10 units (10**0 to 10**3.1)
fig.update_xaxes(range=[0, 3.1], tickangle=0)
fig.update_yaxes(range=[0, 3.1])
fig.update_xaxes(title='Physical coverage', row=1)
fig.update_yaxes(title='Sequencing depth', col=1)
# facet labels read "column=value"; keep only the value
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_layout(width=680, height=500, margin=dict(l=0, r=10, t=1, b=10), showlegend=False)
# NOTE(review): y=10.0 is outside the -2..3 range plotly documents for a
# paper-referenced colorbar position — confirm the intended placement
fig.update_coloraxes(colorbar=dict(orientation='h', thickness=20, y=10.0))
fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
fig.show()

fig.write_image('./figures/runtime_worst.svg')
fig.write_image('./figures/runtime_worst.png', scale=2)