In [None]:
%load_ext autoreload
%autoreload 2
import dt4dds_benchmark
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# File stems of the codec configurations to load (one HDF5 file per stem).
codec_scenarios = (
    'aeon_high',
    'aeon_low',
    'aeon_medium',
    'fountain_high',
    'fountain_low',
    'fountain_medium',
    'goldman_default',
    'rs_high',
    'rs_low',
    'rs_medium',
    'hedges_low',
    'hedges_medium',
    'yinyang_default',
)

# Sub-directories of ./data, one per clustering workflow.
clustering_workflows = (
    'basic',
    'cdhit',
    'clover',
    'lsh',
    'mmseqs2',
    'starcode',
)

# Read every scenario/workflow file (scenario varies slowest, workflow fastest)
# and merge all of them into a single dataset.
loaded_datasets = [
    dt4dds_benchmark.pipelines.HDF5Manager(f'./data/{workflow}/{scenario}.hdf5').get_data()
    for scenario in codec_scenarios
    for workflow in clustering_workflows
]
data = dt4dds_benchmark.analysis.Dataset.combine(*loaded_datasets)

### check fit of simulation results

In [None]:
# Parameters that together identify one codec/clustering combination.
fit_group_columns = ('codec.type', 'codec.name', 'clustering.name', 'clustering.type')

# Fit every combination against the overall error rate and show the resulting
# plot, titled with the same identifying parameters.
for subset in data.separate_by_parameters(list(fit_group_columns)):
    subset_fit = subset.fit('workflow.overall_rate')
    subset_plot = subset_fit.plot(title_columns=list(fit_group_columns))
    subset_plot.show()

### Exemplary plot: recovery-probability fits for DNAFountain (high) with BasicSet vs. CDHit clustering

In [None]:
# Subsets of the DNAFountain "high" runs, one per clustering type being compared
# (BasicSet first, CDHit second).
example_data = [
    data.only_with({
        'codec.type': 'DNAFountain',
        'codec.name': 'high',
        'clustering.type': cluster_type,
    })
    for cluster_type in ('BasicSet', 'CDHit')
]

# Stack the per-subset results; the success flag is cast to float so it can be
# drawn on a numeric axis.
exampledf = pd.concat(
    [subset.combined_results.copy() for subset in example_data]
).assign(decoding_success=lambda frame: frame['decoding_success'].astype(float))

# One fit against the overall error rate per subset, in the same order as example_data.
examplefits = [subset.fit('workflow.overall_rate') for subset in example_data]


# Scatter of the individual decoding outcomes against the overall error rate
# (log-scaled x axis), coloured by clustering type.
fig = px.scatter(
    exampledf,
    x='workflow.overall_rate',
    y='decoding_success',
    log_x=True,
    color='clustering.type',
    color_discrete_map={'BasicSet': '#969696', 'CDHit': '#fb6a4a'},
)

# The evaluation grid is identical for every fit, so build it once.
x_vals = np.logspace(-3, 0, 500)

# Fitted curve for each subset; the colour is chosen from the clustering type
# found in the data the fit was made on.
for fit in examplefits:
    is_basic = (fit.data['clustering.type'] == 'BasicSet').any()
    fig.add_scatter(
        x=x_vals,
        y=fit.predict(x_vals),
        mode='lines',
        line_color='#636363' if is_basic else '#de2d26',
        line_width=2.5,
    )
    print(fit.threshold)

fig.update_xaxes(
    title='Error rate per nt',
    # on a log axis the range is given in log10 units
    range=[-2.4, -0.698],
    dtick=1,
)
fig.update_yaxes(
    title='Recovery probability',
    tickformat=',.0%',
    range=[-0.03, 1.03]
)
fig.add_hline(y=0.95, line_dash='dash', line_color='#252525', line_width=2)

# Exactly one threshold marker per fit. Previously each threshold was drawn
# twice (a thin line in the loop plus a thick line selected by list position);
# the colour is now derived from the fit itself instead of its index.
# Iterating in reverse keeps the former draw order of the thick markers.
for fit in reversed(examplefits):
    is_basic = (fit.data['clustering.type'] == 'BasicSet').any()
    fig.add_vline(
        x=fit.threshold,
        line_dash='solid',
        line_width=2,
        line_color='#252525' if is_basic else '#de2d26',
    )

fig.update_layout(
    width=250,
    height=130,
    margin=dict(l=0, r=5, t=5, b=0),
    showlegend=False,
)

fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
fig.show()
fig.write_image('./figures/example_fit.svg')
fig.write_image('./figures/example_fit.png', scale=2)

### get the threshold values by codec, clustering, and scenario

In [None]:
# One fit per codec/clustering combination against the overall error rate;
# the code rate is additionally averaged within each group.
df = data.get_fits_by_group(
    ['codec.type', 'codec.name', 'clustering.name', 'clustering.type'],
    on='workflow.overall_rate',
    additional_agg={'code_rate': 'mean'},
)

# Store the code rate as a two-decimal string.
df['code_rate'] = df['code_rate'].map(lambda rate: '{:.2f}'.format(rate))

df

In [None]:
# Columns needed for the threshold overview table.
threshold_columns = ['codec.type', 'code_rate', 'clustering.name', 'clustering.type', 'threshold']
idf = df.loc[:, threshold_columns].copy()

# Wide layout: one row per (codec.type, code_rate), one column per
# (clustering.type, clustering.name); the first threshold of each cell is kept.
sdf = idf.pivot_table(
    index=['codec.type', 'code_rate'],
    columns=['clustering.type', 'clustering.name'],
    values='threshold',
    aggfunc='first',
).reset_index()
sdf

### Plot only the best-performing clustering (MMseqs2 excluded) and the basic clustering

In [None]:
# Candidates for the "optimal" clustering: every fit except the MMseqs2 ones.
candidates = df.loc[df['clustering.type'] != 'MMseqs2'].copy()

# Full table with the clustering type doubling as the colour group.
idf = df.copy()
idf['clustergroup'] = idf['clustering.type']

# Per codec.type/codec.name keep the row with the highest threshold and label it
# 'optimal'; then append every BasicSet row as the reference.
best_row_labels = candidates.groupby(['codec.type', 'codec.name'])['threshold'].idxmax()
optimal_rows = candidates.loc[best_row_labels].assign(clustergroup='optimal')
basicset_rows = idf.loc[idf['clustering.type'] == 'BasicSet']
plotdf = pd.concat([optimal_rows, basicset_rows])

# Shorten the long codec names used as tick labels.
short_codec_names = plotdf['codec.type'].str.replace('Goldman', 'GM')
plotdf['codec.type'] = short_codec_names.replace('YinYang', 'YY')

bar_sort_keys = ['codec.type', 'code_rate', 'clustering.type']

fig = dt4dds_benchmark.analysis.plotting.tiered_bar(
    plotdf.sort_values(bar_sort_keys),
    "codec.type",
    "code_rate",
    "threshold",
    color_by="clustergroup",
    color_discrete_map={'BasicSet': '#636363', 'optimal': '#31a354'},
)
fig.update_yaxes(title='Error rate per nt', range=[0, 0.15])
fig.update_layout(
    showlegend=False,
    width=320,
    height=140,
    margin=dict(l=0, r=1, t=10, b=30),
)

fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
# Tick styling is applied after standardize_plot, as in the original cell order.
fig.update_xaxes(tickangle=0, tickfont_size=28/3)
fig.show()
fig.write_image('./figures/best.svg')
fig.write_image('./figures/best.png', scale=2)

# Show the plotted values as a table, in the same order as the bars.
table_columns = ['codec.type', 'codec.name', 'clustering.type', 'threshold', 'code_rate']
display(plotdf.sort_values(bar_sort_keys)[table_columns])

### get the median improvement of clustering vs. BasicSet

In [None]:
# Compare the best clustering against BasicSet for every codec configuration.
#
# The rows of plotdf already carry a 'clustergroup' label ('optimal' for the
# best clustering per codec, 'BasicSet' for the reference rows). Pivoting on
# that label avoids a hard-coded list of clustering names, which would silently
# produce NaN for any codec whose best clustering is not in the list (or is
# BasicSet itself).
pivotdf = plotdf.pivot_table(
    index=['codec.type', 'codec.name'],
    columns='clustergroup',
    values='threshold',
).reset_index()

# Relative and absolute gain of the optimal clustering over BasicSet.
pivotdf['ratio'] = pivotdf['optimal'] / pivotdf['BasicSet']
pivotdf['delta'] = pivotdf['optimal'] - pivotdf['BasicSet']

pivotdf

In [None]:
# Summary statistics of the absolute threshold gain (optimal minus BasicSet);
# the 50% row of describe() is the median.
delta_summary = pivotdf['delta'].describe()
delta_summary

### Check constraints: decoding runtime and memory use

In [None]:
# Join the performance measurements with the results; with no key given,
# pandas merges on the columns the two frames have in common.
performancedf = pd.merge(data.combined_performances, data.results)

# Keep only the measurements whose identifier is b'decoding'.
not_decoding = performancedf['identifier'] != b'decoding'
performancedf = performancedf.drop(index=performancedf.index[not_decoding])

# Two-decimal code-rate labels; 1.51 and 1.01 are folded into 1.50 and 1.00.
performancedf['code_rate'] = (
    performancedf['code_rate']
    .map(lambda rate: '{:.2f}'.format(rate))
    .replace({'1.51': '1.50', '1.01': '1.00'})
)

# BasicSet is shown under the label 'Naive'.
performancedf['clustering.type'] = performancedf['clustering.type'].mask(
    performancedf['clustering.type'] == 'BasicSet', 'Naive'
)

# Key of the form <codec.type>_<codec.name>_<clustering.type> used for selection.
performancedf['choosename'] = (
    performancedf['codec.type']
    + '_' + performancedf['codec.name']
    + '_' + performancedf['clustering.type']
)

# Per codec configuration: the Naive run plus one selected clustering run.
selected_combinations = (
    'DNAAeon_low_Naive',
    'DNAAeon_low_CDHit',
    'DNAAeon_medium_Naive',
    'DNAAeon_medium_CDHit',
    'DNAAeon_high_Naive',
    'DNAAeon_high_CDHit',
    'DNAFountain_low_Naive',
    'DNAFountain_low_Starcode',
    'DNAFountain_medium_Naive',
    'DNAFountain_medium_Starcode',
    'DNAFountain_high_Naive',
    'DNAFountain_high_CDHit',
    'DNARS_low_Naive',
    'DNARS_low_CDHit',
    'DNARS_medium_Naive',
    'DNARS_medium_CDHit',
    'DNARS_high_Naive',
    'DNARS_high_CDHit',
    'Goldman_default_Naive',
    'Goldman_default_LSH',
    'HEDGES_low_Naive',
    'HEDGES_low_LSH',
    'HEDGES_medium_Naive',
    'HEDGES_medium_CDHit',
    'YinYang_default_Naive',
    'YinYang_default_CDHit',
)
is_selected = performancedf['choosename'].isin(selected_combinations)
plotdf = performancedf.loc[is_selected].copy()

# Collapse every non-Naive clustering into a single 'Clustering' facet.
plotdf['clustering.type'] = plotdf['clustering.type'].where(
    plotdf['clustering.type'] == 'Naive', 'Clustering'
)

# Divide by 60 -- presumably seconds to minutes (the plot below labels this
# axis 'Runtime / min'); confirm the unit of 'duration'.
plotdf['duration'] /= 60

# Goldman and YinYang only have a 'default' configuration; assign them to the
# 'low' and 'high' facet columns respectively.
for codec, level in {'Goldman': 'low', 'YinYang': 'high'}.items():
    plotdf.loc[plotdf['codec.type'] == codec, 'codec.name'] = level

# Order rows by codec and by configuration level low < medium < high.
level_rank = {'low': 0, 'medium': 1, 'high': 2}
plotdf['codec.nameorder'] = plotdf['codec.name'].map(level_rank)
plotdf = plotdf.sort_values(['codec.type', 'codec.nameorder'])

plotdf

In [None]:
# Runtime versus error rate: columns = codec configuration level,
# rows = Naive / Clustering, filled markers = successful decoding.
codec_colors = {
    'DNAAeon': '#3182bd',
    'DNAFountain': '#31a354',
    'DNARS': '#e6550d',
    'HEDGES': '#756bb1',
    'YinYang': '#636363',
    'Goldman': '#de2d26',
}
success_symbols = {True: 'circle', False: 'circle-open'}

fig = px.scatter(
    plotdf,
    x='workflow.overall_rate',
    y='duration',
    color='codec.type',
    symbol='decoding_success',
    facet_col='codec.name',
    facet_row='clustering.type',
    facet_row_spacing=0.1,
    facet_col_spacing=0.05,
    color_discrete_map=codec_colors,
    symbol_map=success_symbols,
)

# Dotted reference line at 60 on the runtime axis, drawn in every facet.
fig.add_hline(y=60, line_dash="dot", line_color="black", line_width=2)

# Unlink the x axes and give every facet the same fixed ranges and minor ticks.
fig.update_xaxes(matches=None, range=[0, 0.2], minor_dtick=0.025)
fig.update_yaxes(range=[0, 61], minor_dtick=10)

# Axis titles only on the first row / first column of the facet grid.
fig.update_xaxes(title='Error rate per nt', row=1)
fig.update_yaxes(title='Runtime / min', col=1)

# Facet labels come as "<column>=<value>"; keep just the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))

fig.update_layout(
    showlegend=False,
    width=670,
    height=300,
    margin=dict(l=0, r=10, t=20, b=10),
)
fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
fig.show()
fig.write_image('./figures/duration.svg')
fig.write_image('./figures/duration.png', scale=2)

In [None]:
# Memory use versus error rate: columns = codec configuration level,
# rows = Naive / Clustering, filled markers = successful decoding.
codec_colors = {
    'DNAAeon': '#3182bd',
    'DNAFountain': '#31a354',
    'DNARS': '#e6550d',
    'HEDGES': '#756bb1',
    'YinYang': '#636363',
    'Goldman': '#de2d26',
}
success_symbols = {True: 'circle', False: 'circle-open'}

fig = px.scatter(
    plotdf,
    x='workflow.overall_rate',
    y='memory_value',
    color='codec.type',
    symbol='decoding_success',
    facet_col='codec.name',
    facet_row='clustering.type',
    facet_row_spacing=0.1,
    facet_col_spacing=0.05,
    color_discrete_map=codec_colors,
    symbol_map=success_symbols,
)

# Dotted reference line at 8 on the memory axis, drawn in every facet.
fig.add_hline(y=8, line_dash="dot", line_color="black", line_width=2)

# Unlink the x axes and give every facet the same fixed ranges and minor ticks.
fig.update_xaxes(matches=None, range=[0, 0.2], minor_dtick=0.025)
fig.update_yaxes(range=[0, 8.1], minor_dtick=1)

# Axis titles only on the first row / first column of the facet grid.
fig.update_xaxes(title='Error rate per nt', row=1)
fig.update_yaxes(title='Memory use / GB', col=1)

# Facet labels come as "<column>=<value>"; keep just the value.
fig.for_each_annotation(lambda label: label.update(text=label.text.split("=")[-1]))

fig.update_layout(
    showlegend=False,
    width=670,
    height=300,
    margin=dict(l=0, r=10, t=20, b=10),
)
fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)
fig.show()
fig.write_image('./figures/memory.svg')
fig.write_image('./figures/memory.png', scale=2)