In [None]:
%load_ext autoreload
%autoreload 2
import dt4dds_benchmark
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Codec configurations; both the simulated and the experimental results are
# stored as one HDF5 file per codec.
CODECS = (
    'aeon_high',
    'aeon_max',
    'aeon_medium',
    'fountain_high',
    'fountain_max',
    'fountain_medium',
    'goldman',
    'rs_high',
    'rs_max',
    'rs_medium',
    'hedges',
    'yinyang',
)
CLUSTERINGS = ('basic', 'default')
SCENARIOS = ('bestcase', 'worstcase')

# Simulation results: ./sim_data/<scenario>/<clustering>/<codec>.hdf5
sim_paths = [
    f'./sim_data/{scenario}/{clustering}/{codec}.hdf5'
    for codec in CODECS
    for clustering in CLUSTERINGS
    for scenario in SCENARIOS
]
data_sim = dt4dds_benchmark.analysis.Dataset.combine(
    *(dt4dds_benchmark.pipelines.HDF5Manager(path).get_data() for path in sim_paths)
)

# Experimental decoding results: ./decoding_data/<scenario>/<codec>.hdf5
exp_paths = [
    f'./decoding_data/{scenario}/{codec}.hdf5'
    for codec in CODECS
    for scenario in SCENARIOS
]
data_exp = dt4dds_benchmark.analysis.Dataset.combine(
    *(dt4dds_benchmark.pipelines.HDF5Manager(path).get_data() for path in exp_paths)
)

In [None]:
# Code-rate label for every codec configuration, keyed by
# '<codec.type>_<codec.name>'. The labels are kept as strings.
exp2coderate = {
    # DNAAeon
    'DNAAeon_max': "1.81",
    'DNAAeon_high': "1.51",
    'DNAAeon_medium': "1.01",
    # DNAFountain
    'DNAFountain_max': "1.74",
    'DNAFountain_high': "1.47",
    'DNAFountain_medium': "1.00",
    # DNARS
    'DNARS_max': "1.64",
    'DNARS_high': "1.50",
    'DNARS_medium': "1.00",
    # codecs with a single configuration
    'Goldman_default': "0.34",
    'HEDGES_pool': "0.99",
    'YinYang_default-pool': "1.82",
}

### prepare experimental data

In [None]:
# Average decoding_success for every codec / clustering / scenario / coverage
# combination. 'cov' is the coverage label with its first three characters
# removed, converted to float.
df_exp = (
    data_exp.combined_results
    .assign(
        cov=lambda frame: frame['metadata.coverage'].str[3:].astype(float),
        iteration=lambda frame: frame['metadata.iteration'],
        scenario=lambda frame: frame['metadata.scenario'],
    )
    .groupby(['codec.type', 'codec.name', 'clustering.type', 'clustering.name', 'scenario', 'cov'])['decoding_success']
    .mean()
    .reset_index()
)

# For each codec / scenario / coverage keep only the row (i.e. the clustering
# variant) with the highest mean decoding_success.
best_rows = df_exp.groupby(['codec.type', 'codec.name', 'scenario', 'cov'])['decoding_success'].idxmax()
df_exp = df_exp.loc[best_rows]

df_exp

In [None]:
# Spot check: rows of the worst-case scenario that belong to the DNAAeon codec.
df_exp[df_exp['scenario'].eq('worstcase') & df_exp['codec.type'].eq('DNAAeon')]

In [None]:
# Keep only the conditions in which decoding succeeded at least once.
# The explicit .copy() makes success_df an independent frame, so the column
# assignments below never act on a slice of df_exp (avoids pandas'
# SettingWithCopyWarning).
success_df = df_exp.loc[
    df_exp['decoding_success'] > 0.00,
    ['codec.type', 'codec.name', 'clustering.type', 'scenario', 'cov', 'decoding_success'],
].copy()

# Build the '<codec.type>_<codec.name>' key used by exp2coderate. '_pool' is
# removed as a literal substring; regex=False makes that independent of the
# pandas-version-specific default of str.replace.
success_df['codec.name'] = success_df['codec.type'] + '_' + success_df['codec.name'].str.replace('_pool', '', regex=False)
success_df['coderate'] = success_df['codec.name'].map(exp2coderate)
# The combined key was only needed for the lookup.
success_df = success_df.drop(columns=['codec.name'])

success_df

### check simulation results

In [None]:
# Fit every workflow / codec / clustering subset against the coverage and show
# the resulting plot, titled with the same grouping columns.
group_columns = ['workflow.type', 'codec.type', 'codec.name', 'clustering.type']
for subset in data_sim.separate_by_parameters(group_columns):
    fitted = subset.fit('workflow.coverage')
    fitted.plot(title_columns=group_columns).show()

### prepare simulated data

In [None]:
# One fit over 'workflow.coverage' per workflow / codec / clustering group.
df_sim = data_sim.get_fits_by_group(
    ['workflow.type', 'codec.type', 'codec.name', 'clustering.type'],
    'workflow.coverage',
    additional_agg={'code_rate': 'mean'},
)

# Evaluate every fit at a fixed set of coverages (the sigmoid takes
# log10(coverage)); groups without a successful fit get the value 0.
records = []
for _, fit in df_sim.iterrows():
    fit_ok = fit['message'] == 'Fit successful.'
    for cov in [2, 5, 10, 25, 50, 1000]:
        record = fit.copy()
        record['cov'] = cov
        record['prop'] = (
            dt4dds_benchmark.analysis.datafit.sigmoid(np.log10(cov), fit['params_k'], fit['params_x0'])
            if fit_ok
            else 0
        )
        records.append(record)

# Per codec and coverage, keep the highest value over the clustering variants.
prop_df = (
    pd.DataFrame(records)
    .groupby(['workflow.type', 'codec.type', 'codec.name', 'cov'])['prop']
    .max()
    .reset_index()
)

# Replace the codec name by its code-rate label from exp2coderate.
prop_df['codec.name'] = prop_df['codec.type'] + '_' + prop_df['codec.name'].str.replace('_pool', '')
prop_df['coderate'] = prop_df['codec.name'].map(exp2coderate)

# Wide table: one row per workflow / codec / code rate, one column per coverage.
prop_df = (
    prop_df
    .drop(columns=['codec.name'])
    .pivot(index=['workflow.type', 'codec.type', 'coderate'], columns='cov', values='prop')
    .reset_index()
    .sort_values(['workflow.type', 'codec.type', 'coderate'])
)

prop_df

In [None]:
# One figure per scenario: the heatmap shows the fitted 'prop' values of the
# simulation, the check marks show the experimental decoding successes.
for workflow, scenario, covs in (
    ('Pool_Bestcase', 'bestcase', (2, 5, 10, 25, 1000)),
    ('Pool_Worstcase', 'worstcase', (5, 10, 25, 50, 1000)),
):

    iprop_df = prop_df.loc[prop_df['workflow.type'] == workflow].copy()
    isuccess_df = success_df.loc[success_df['scenario'] == scenario].copy()

    fig = go.Figure(
        data=go.Heatmap(
            # rows follow iprop_df, columns follow covs
            z=np.array([iprop_df[s].values for s in covs]).T,
            x=[str(s) for s in covs],
            y=[iprop_df['codec.type'].values, iprop_df['coderate'].values],
            hoverongaps = False,
            colorbar=dict(thickness=10, len=0.5, tickfont=dict(size=28/3, family='Inter'), dtick=0.5, tickformat='.0%'),
            colorscale=[[0, '#ffffff'], [1, '#74c476']],
        )
    )

    fig.update_layout(
        width=300,
        height=250,
        margin=dict(l=10, r=0, t=5, b=0),
        showlegend=False,
    )
    fig.update_yaxes(
        autorange='reversed',
    )
    fig = dt4dds_benchmark.analysis.plotting.standardize_plot(fig)

    # Annotations are placed by category position. The positions are derived
    # from the rows/columns that were actually plotted, so a check mark can
    # never end up on a different row than its codec, and every code rate
    # present in the heatmap can be annotated. Both tables are built once per
    # figure instead of once per annotation.
    cov2col = {cov: col for col, cov in enumerate(covs)}
    cell2row = {
        (codec_type, coderate): pos
        for pos, (codec_type, coderate) in enumerate(zip(iprop_df['codec.type'], iprop_df['coderate']))
    }

    for _, success in isuccess_df.iterrows():
        fig.add_annotation(
            text='✓',
            x=cov2col[success['cov']],
            y=cell2row[(success['codec.type'], success['coderate'])],
            showarrow=False,
            font=dict(size=2*28/3, family='Inter', weight='bold', color='black'),
            # fainter check mark for a lower mean decoding success
            opacity=success['decoding_success'],
        )

    fig.write_image(f"./figures/exp_sim_table_{scenario}.svg")
    fig.write_image(f"./figures/exp_sim_table_{scenario}.png", scale=2)
    fig.show()