In [1]:
import plotly.express as px
import plotly.graph_objects as pg
import numpy as np
import pandas as pd
import scipy.stats

# Dropout comparison

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

# Root directory; it contains one sub-folder per level, each holding the pool folders.
parent_folder = '../data/extreme_cases'

# Sub-folders of `parent_folder` that are compared against each other.
level_names = ['worst', 'best']

# Coverage value -> pool folder name. The folder suffix is ten times the
# coverage, zero-padded to three digits (0.5 -> 'Pool_005', 32.5 -> 'Pool_325').
# Whole-number coverages stay ints so they render as '50' rather than '50.0'
# in the experiment keys built below.
pool_names = {
    cov: f'Pool_{round(cov * 10):03d}'
    for cov in (
        50, 45, 40, 35, 32.5, 30, 27.5, 25, 22.5, 20,
        18, 16, 15, 14, 13, 12, 11, 10,
        9, 8, 7, 6, 5, 4, 3, 2.5, 2, 1.5, 1, 0.5,
    )
}

# Data-set label -> statistics file expected inside every pool folder.
inputs = {
    'errorfree': 'scafstats_pear_perfect.txt',
    'witherrors': 'scafstats.txt',
}

# Experiment key '<cov>_<set>_<lvl>' -> path of the corresponding statistics file.
# Entries are generated coverage-first, then level, then data set.
input_dict = {
    f'{cov}_{err}_{lvl}': f'{parent_folder}/{lvl}/{cov_folder}/{err_file}'
    for cov, cov_folder in pool_names.items()
    for lvl in level_names
    for err, err_file in inputs.items()
}

# Hand every experiment key -> file path to the project's DistributionAnalysis.
# The cell output suggests it reports a "Missing sequences" count per input
# while loading; the combined per-sequence table is read later via `data.data`.
data = analysis.DistributionAnalysis(input_dict)


Missing sequences: 1659, 13.83%
Missing sequences: 569, 4.74%
Missing sequences: 0, 0.00%
Missing sequences: 0, 0.00%
Missing sequences: 1830, 15.25%
Missing sequences: 656, 5.47%
Missing sequences: 0, 0.00%
Missing sequences: 0, 0.00%
Missing sequences: 1964, 16.37%
Missing sequences: 743, 6.19%
Missing sequences: 0, 0.00%
Missing sequences: 0, 0.00%
Missing sequences: 2194, 18.28%
Missing sequences: 871, 7.26%
Missing sequences: 0, 0.00%
Missing sequences: 0, 0.00%
Missing sequences: 2343, 19.53%
Missing sequences: 956, 7.97%
Missing sequences: 0, 0.00%
Missing sequences: 0, 0.00%
Missing sequences: 2519, 20.99%
Missing sequences: 1054, 8.78%
Missing sequences: 1, 0.01%
Missing sequences: 0, 0.00%
Missing sequences: 2664, 22.20%
Missing sequences: 1139, 9.49%
Missing sequences: 0, 0.00%
Missing sequences: 0, 0.00%
Missing sequences: 2879, 23.99%
Missing sequences: 1282, 10.68%
Missing sequences: 4, 0.03%
Missing sequences: 4, 0.03%
Missing sequences: 3114, 25.95%
Missing sequences: 1

Unnamed: 0_level_0,mean,total
exp,Unnamed: 1_level_1,Unnamed: 2_level_1
0.5_errorfree_best,128.870500,1546446.0
0.5_errorfree_worst,36.322000,435864.0
0.5_witherrors_best,333.333333,4000000.0
0.5_witherrors_worst,332.771833,3993262.0
1.5_errorfree_best,128.267500,1539210.0
...,...,...
8_witherrors_worst,332.558500,3990702.0
9_errorfree_best,128.410583,1540927.0
9_errorfree_worst,35.803833,429646.0
9_witherrors_best,333.333333,4000000.0


In [3]:
# Pool size used as the dropout denominator. It agrees with the loader output
# of the previous cell, where 1659 missing sequences are reported as 13.83 %.
N_SEQUENCES = 12000

# The table is only read in this cell, so it is used directly rather than copied.
read_counts = data.data

# Dropout per experiment key: share of the pool without a single assigned read.
# A sequence counts as present when its `assignedReads` entry is non-zero.
all_results = {
    exp: 1 - np.count_nonzero(read_counts.loc[read_counts['exp'] == exp, 'assignedReads']) / N_SEQUENCES
    for exp in read_counts['exp'].unique()
}

In [4]:
# One row per experiment key ('<cov>_<set>_<lvl>') together with its dropout fraction.
plot_data = (
    pd.DataFrame.from_dict(all_results, orient="index", columns=["dropout"])
    .reset_index()
    .rename(columns={"index": "exp"})
)

# Split all keys in a single vectorised pass instead of masking the frame
# three times per row. Assigning the three column names fails if the split
# does not produce exactly three columns.
exp_fields = plot_data['exp'].str.split('_', expand=True)
exp_fields.columns = ['cov', 'set', 'lvl']
plot_data = plot_data.join(exp_fields)

# Replace the whole column so that it actually gets a numeric dtype; writing
# converted values back row by row leaves it as an object column.
plot_data['cov'] = pd.to_numeric(plot_data['cov'])

In [5]:
# One row per experiment key ('<cov>_<set>_<lvl>') together with its dropout fraction.
plot_data = (
    pd.DataFrame.from_dict(all_results, orient="index", columns=["dropout"])
    .reset_index()
    .rename(columns={"index": "exp"})
)

# Split all keys in a single vectorised pass instead of masking the frame
# three times per row. Assigning the three column names fails if the split
# does not produce exactly three columns.
exp_fields = plot_data['exp'].str.split('_', expand=True)
exp_fields.columns = ['cov', 'set', 'lvl']
plot_data = plot_data.join(exp_fields)

# Replace the column instead of assigning through `.loc[:, 'cov']`: since
# pandas 2.0 a whole-column `.loc` assignment writes into the existing object
# column, so the values would be converted but the dtype would stay `object`.
plot_data['cov'] = pd.to_numeric(plot_data['cov'])

fig = pg.Figure()

# Line and fill colour per level, indexed by order of appearance in `plot_data.lvl`.
colors = ["#bdbdbd", "#636363"]
colors_fill = ['#ececec', '#cccccc']

for j, lvl in enumerate(plot_data.lvl.unique()):
    for i, iset in enumerate(plot_data.set.unique()):

        # The three-point centred window below is positional, so the points are
        # ordered by coverage first; the window then always averages neighbouring
        # coverages and the line is drawn monotonically along x.
        idata = (
            plot_data.loc[(plot_data['lvl'] == lvl) & (plot_data['set'] == iset), ['cov', 'dropout']]
            .sort_values('cov')
        )
        smoothed_dropout = idata['dropout'].rolling(3, min_periods=1, center=True).mean()

        fig.add_trace(
            pg.Scatter(
                x=idata['cov'],
                y=smoothed_dropout,
                # solid line for the 'witherrors' set, dashed for the other one
                line_dash=None if iset == 'witherrors' else 'dash',
                line_width=1.5,
                # the second set of each level shades the band back to the first set
                fill='tonexty' if i == 1 else None,
                fillcolor=colors_fill[j],
                line_color=colors[j],
            )
        )

# Single diamond marker at (6.2, 15 %).
# NOTE(review): the notebook does not say what this point represents — add a label or comment.
fig.add_trace(pg.Scatter(x=[6.2], y=[0.15], marker_symbol='diamond', marker_size=8, marker_color="#222222"))


# Text styling shared by all axes.
# NOTE(review): 28/3 px is presumably 7 pt (1 pt = 4/3 px) — confirm against the target figure spec.
FONT_FAMILY = "Inter"
FONT_SIZE = 28 / 3
axis_font = dict(title_font_family=FONT_FAMILY, title_font_size=FONT_SIZE, tickfont_size=FONT_SIZE)

# Dropout axis: percent labels, major ticks every 20 %, minor ticks every 10 %.
fig.update_yaxes(
    title_text='Dropout',
    tickformat=".0%",
    dtick=0.2,
    range=[0, 0.4],
    minor_ticks="outside",
    minor_dtick=0.1,
    **axis_font,
)

fig.update_layout(
    template="simple_white",
    height=150,
    width=330,
    showlegend=False,
    margin=dict(l=0, r=10, t=0, b=0),
    font_family=FONT_FAMILY,
    legend_font_size=FONT_SIZE,
    # bottom axis: physical redundancy on a linear scale
    xaxis1=pg.layout.XAxis(
        range=[0, 40],
        tickmode='linear',
        dtick=10,
        title="Physical redundancy",
        minor_ticks="outside",
        minor_dtick=5,
        **axis_font,
    ),
    # top axis: same range as the bottom axis, relabelled as storage density
    # at hand-picked positions
    xaxis2=pg.layout.XAxis(
        overlaying='x',
        side='top',
        range=[0, 40],
        tickmode='array',
        tickvals=[2.5, 5, 10, 20, 30, 40],
        ticktext=['48', '24', '12', '6', '4', '3'],
        title="Approx. storage density / EB g<sup>-1</sup>",
        **axis_font,
    ),
)

# The secondary x axis is only drawn when a trace is attached to it, so attach
# a dummy point; y=1 lies outside the visible y range and is never shown.
fig.add_trace(pg.Scatter(x=[1], y=[1], xaxis='x2', yaxis='y'))

fig.show()
fig.write_image("extremes.svg")