In [1]:
import plotly.express as px
import plotly.graph_objects as pg
import numpy as np
import pandas as pd
import scipy.stats

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis


data = analysis.GroupAnalysis([
    ('exp', analysis.ErrorAnalysis("../data/DoT/Bunny_M")),
    ('exp', analysis.ErrorAnalysis("../data/DoT/Bunny_P")),
    ('exp', analysis.ErrorAnalysis("../data/DoT/Bunny_F1")),
    ('exp', analysis.ErrorAnalysis("../data/DoT/Bunny_F2")),
    ('exp', analysis.ErrorAnalysis("../data/DoT/Bunny_F3")),
    ('exp', analysis.ErrorAnalysis("../data/DoT/Bunny_F4")),
    ('exp', analysis.ErrorAnalysis("../data/DoT/Bunny_F5")),
    ('sim', analysis.ErrorAnalysis("../data/bunny_generations/Bunny_M")),
    ('sim', analysis.ErrorAnalysis("../data/bunny_generations/Bunny_P")),
    ('sim', analysis.ErrorAnalysis("../data/bunny_generations/Bunny_F1")),
    ('sim', analysis.ErrorAnalysis("../data/bunny_generations/Bunny_F2")),
    ('sim', analysis.ErrorAnalysis("../data/bunny_generations/Bunny_F3")),
    ('sim', analysis.ErrorAnalysis("../data/bunny_generations/Bunny_F4")),
    ('sim', analysis.ErrorAnalysis("../data/bunny_generations/Bunny_F5")),
])

In [3]:
# Grouped bar chart of mean deletion / substitution rates, one cluster of
# bars per (error type, group) and one bar per experiment inside a cluster.

# Explicit orderings: the tick labels and colours below are tied to these
# lists instead of to whatever order `unique()` happens to return.
error_types = ['deletions', 'substitutions']
group_colors = {'exp': "#bdbdbd", 'sim': "#636363"}  # group labels as assigned when loading
exp_order = ["Bunny_M", "Bunny_P", "Bunny_F1", "Bunny_F2", "Bunny_F3", "Bunny_F4", "Bunny_F5"]

# Mean rate per (type, exp, group); filtering produces a new frame rather
# than dropping rows in place, so re-running the cell is idempotent.
mean_rates = data.data['overall_error_rates'].groupby(['type', 'exp', 'group'])['rate'].mean().reset_index()
plot_data = mean_rates.loc[mean_rates['type'].isin(error_types)]

fig = pg.Figure()
bar_width = 0.125
cluster_spacing = 2.5  # x distance between the two error-type clusters

# Bar positions relative to the cluster centre, symmetric around zero
# ([-3, ..., 3] for the seven experiments in exp_order).
bar_offsets = np.arange(len(exp_order)) - (len(exp_order) - 1) / 2

for i, errortype in enumerate(error_types):

    for j, (group, color) in enumerate(group_colors.items()):

        this_data = plot_data.loc[(plot_data['group'] == group) & (plot_data['type'] == errortype)].set_index('exp')
        # Label-based lookup raises KeyError if an experiment is missing,
        # instead of silently plotting a shifted set of bars.
        error_data = this_data.loc[exp_order, 'rate'].to_numpy()

        fig.add_trace(
            pg.Bar(
                x=0.5 + cluster_spacing*i + j + bar_width*bar_offsets,
                y=error_data*1000,  # y axis is in units of 10^-3 per nt
                width=bar_width,
                marker_color=color
            )
        )


fig.update_xaxes(
    range=[-0.05, 4.45],
    tickmode = 'array',
    # One tick per error type, centred between its two group clusters (1 and 3.5).
    tickvals = [1 + cluster_spacing*i for i in range(len(error_types))],
    ticktext = [errortype.capitalize() for errortype in error_types],
    title_font_family="Inter", title_font_size=28/3, tickfont_size=28/3
)
fig.update_yaxes(
    range=[0, 20],
    title="Error rate / 10<sup>-3</sup> nt<sup>-1</sup>",
    title_font_family="Inter", title_font_size=28/3, tickfont_size=28/3, minor_ticks="outside", minor_dtick=2.5
)
fig.update_traces(marker=dict(line_width=0.5), selector=dict(type='bar')) 
fig.update_layout(
    template="simple_white", 
    height=150, 
    width=330,
    showlegend=False, 
    margin=dict(l=47, r=10, t=0, b=0),
    font_family="Inter",
    legend_font_size=28/3,
)
fig.show()
fig.write_image("comparison_errorrates.svg")

# Dropout comparison

In [4]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

data = analysis.DistributionAnalysis({
    'BunnyM_exp': '../data/DoT/Bunny_M/scafstats_pear_perfect.txt',
    'BunnyP_exp': '../data/DoT/Bunny_P/scafstats_pear_perfect.txt',
    'BunnyF1_exp': '../data/DoT/Bunny_F1/scafstats_pear_perfect.txt',
    'BunnyF2_exp': '../data/DoT/Bunny_F2/scafstats_pear_perfect.txt',
    'BunnyF3_exp': '../data/DoT/Bunny_F3/scafstats_pear_perfect.txt',
    'BunnyF4_exp': '../data/DoT/Bunny_F4/scafstats_pear_perfect.txt',
    'BunnyF5_exp': '../data/DoT/Bunny_F5/scafstats_pear_perfect.txt',
    'BunnyM_sim': '../data/bunny_generations/Bunny_M/scafstats_pear_perfect.txt',
    'BunnyP_sim': '../data/bunny_generations/Bunny_P/scafstats_pear_perfect.txt',
    'BunnyF1_sim': '../data/bunny_generations/Bunny_F1/scafstats_pear_perfect.txt',
    'BunnyF2_sim': '../data/bunny_generations/Bunny_F2/scafstats_pear_perfect.txt',
    'BunnyF3_sim': '../data/bunny_generations/Bunny_F3/scafstats_pear_perfect.txt',
    'BunnyF4_sim': '../data/bunny_generations/Bunny_F4/scafstats_pear_perfect.txt',
    'BunnyF5_sim': '../data/bunny_generations/Bunny_F5/scafstats_pear_perfect.txt',
    'BunnyF6_sim': '../data/bunny_generations/Bunny_F6/scafstats_pear_perfect.txt',
    'BunnyF7_sim': '../data/bunny_generations/Bunny_F7/scafstats_pear_perfect.txt',
    'BunnyF8_sim': '../data/bunny_generations/Bunny_F8/scafstats_pear_perfect.txt',
    'BunnyF9_sim': '../data/bunny_generations/Bunny_F9/scafstats_pear_perfect.txt',
})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Missing sequences: 706, 5.88%
Missing sequences: 1192, 9.93%
Missing sequences: 1921, 16.01%
Missing sequences: 1685, 14.04%
Missing sequences: 2266, 18.88%
Missing sequences: 3070, 25.58%
Missing sequences: 2440, 20.33%
Missing sequences: 404, 3.37%
Missing sequences: 627, 5.22%
Missing sequences: 938, 7.82%
Missing sequences: 1294, 10.78%
Missing sequences: 1764, 14.70%
Missing sequences: 2308, 19.23%
Missing sequences: 2763, 23.03%
Missing sequences: 3390, 28.25%
Missing sequences: 3915, 32.62%
Missing sequences: 4424, 36.87%
Missing sequences: 4948, 41.23%


Unnamed: 0_level_0,mean,total
exp,Unnamed: 1_level_1,Unnamed: 2_level_1
BunnyF1_exp,8.627583,103531.0
BunnyF1_sim,27.683417,332201.0
BunnyF2_exp,9.017667,108212.0
BunnyF2_sim,24.313417,291761.0
BunnyF3_exp,8.575167,102902.0
BunnyF3_sim,21.283417,255401.0
BunnyF4_exp,7.370583,88447.0
BunnyF4_sim,18.53275,222393.0
BunnyF5_exp,26.273583,315283.0
BunnyF5_sim,16.133333,193600.0


In [5]:
# Estimate sequence dropout per generation.
#   * experimental sets: fraction of the pool with zero assigned reads
#   * simulated sets: the same fraction after resampling the simulated reads
#     down to the experimental read depth, averaged over N_SAMPLES draws
# Results are stored in `all_results` under "<generation index>_<set>".

N_SAMPLES = 5         # resampling repetitions averaged per simulated generation
N_SEQUENCES = 12000   # dropout denominator; matches the "Missing sequences" percentages printed on load
SEED = 42             # fixed seed so the resampled dropout values are reproducible

EXP_GENERATIONS = ["M", "P", "F1", "F2", "F3", "F4", "F5"]   # have both 'exp' and 'sim' data
SIM_ONLY_GENERATIONS = ["F6", "F7", "F8", "F9"]              # only 'sim' data is loaded for these

# Split the sample key "Bunny<gen>_<set>" into its parts in one vectorised pass:
# group -> "Bunny<gen>", set -> "exp" / "sim", gen -> "<gen>".
plot_data = data.data.copy()
key_parts = plot_data['exp'].str.split('_', expand=True)
plot_data['group'] = key_parts[0]
plot_data['set'] = key_parts[1]
plot_data['gen'] = plot_data['group'].str.replace('Bunny', '', regex=False)


def _reads_of(gen, set_name):
    """Return the '#name' / 'assignedReads' rows of one generation and set."""
    selected = (plot_data['gen'] == gen) & (plot_data['set'] == set_name)
    return plot_data.loc[selected, ['#name', 'assignedReads']]


def _mean_resampled_dropout(reads, n_reads, rng):
    """Mean dropout after drawing `n_reads` reads from `reads`, over N_SAMPLES draws.

    Rows are drawn with replacement, weighted by their 'assignedReads' count;
    dropout is one minus the share of the N_SEQUENCES pool seen at least once.
    `rng` is a shared random state, so successive draws differ but the whole
    sequence is reproducible.
    """
    dropouts = []
    for _ in range(N_SAMPLES):
        drawn = reads.sample(n_reads, weights=reads.assignedReads, replace=True, random_state=rng)
        unique_seqs = len(drawn['#name'].unique())
        dropouts.append(1-unique_seqs/N_SEQUENCES)
    return np.mean(dropouts)


# One generator object for all draws: passing the integer seed to every
# sample() call would make each repetition return the identical sample.
rng = np.random.RandomState(SEED)
exp_reads_count = []
all_results = {}

for j, gen in enumerate(EXP_GENERATIONS):
    exp_data = _reads_of(gen, 'exp')
    exp_reads = int(exp_data.assignedReads.sum())
    exp_reads_count.append(exp_reads)
    # Resample the simulation to the read depth of the matching experiment.
    all_results[f"{j}_sim"] = _mean_resampled_dropout(_reads_of(gen, 'sim'), exp_reads, rng)
    all_results[f"{j}_exp"] = 1-np.count_nonzero(exp_data['assignedReads'])/N_SEQUENCES

# Generations without an experimental counterpart are resampled to the mean
# experimental read depth; indices continue after the last experimental one.
mean_exp_reads = int(np.mean(exp_reads_count))
for j, gen in enumerate(SIM_ONLY_GENERATIONS, start=len(EXP_GENERATIONS)):
    all_results[f"{j}_sim"] = _mean_resampled_dropout(_reads_of(gen, 'sim'), mean_exp_reads, rng)

In [6]:
# Scatter plot of dropout per generation, experimental vs. simulated, with a
# secondary top axis giving the total number of PCR cycles per generation.

# One row per "<generation index>_<set>" key of all_results.
plot_data = (
    pd.DataFrame.from_dict(all_results, orient="index", columns=["dropout"])
    .reset_index()
    .rename(columns={"index": "exp"})
)

# Split the key once; 'gen' becomes a proper numeric column (not object dtype).
key_parts = plot_data['exp'].str.split('_', expand=True)
plot_data['gen'] = pd.to_numeric(key_parts[0])
plot_data['set'] = key_parts[1]

# Dodge the two sets horizontally so their markers do not overlap:
# 'sim' is shifted right, everything else left.
DODGE = 0.075
plot_data['gen'] = plot_data['gen'] + np.where(plot_data['set'] == 'sim', DODGE, -DODGE)

generation_labels = ['M', 'P', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9']
pcr_cycle_labels = ['44', '59', '71', '83', '95', '107', '119', '131', '143', '155', '167']
tick_positions = np.arange(len(generation_labels))

fig = px.scatter(
    plot_data,
    x="gen",
    y="dropout",
    color="set",
    # Explicit mapping so colours do not depend on which set appears first in the data.
    color_discrete_map={"sim": "#636363", "exp": "#bdbdbd"},
    range_y=[0, 0.6],
)

fig.update_xaxes(title_text='Generation')
fig.update_yaxes(title_text='Dropout', tickformat=".0%", title_font_family="Inter", title_font_size=28/3, tickfont_size=28/3,)
fig.update_layout(
    template="simple_white", 
    height=150, 
    width=330,
    showlegend=False, 
    margin=dict(l=47, r=10, t=0, b=0),
    font_family="Inter",
    legend_font_size=28/3,
    xaxis1 = pg.layout.XAxis( 
        range=[-0.4, 10.2],
        tickmode='array',
        tickvals=tick_positions,
        ticktext=generation_labels,
        title="Generation",
        title_font_family="Inter", title_font_size=28/3, tickfont_size=28/3,
    ),
    # Same range and tick positions as the bottom axis, relabelled with PCR cycles.
    xaxis2 = pg.layout.XAxis( 
        overlaying='x',
        side='top',
        range=[-0.4, 10.2],
        tickmode='array',
        tickvals=tick_positions,
        ticktext=pcr_cycle_labels,
        title="Total PCR cycles",
        title_font_family="Inter", title_font_size=28/3, tickfont_size=28/3,
    ),
)

# we need to plot something on the second x axis for it to show
# (the dummy point sits at y=1, outside the visible 0-0.6 y range)
fig.add_trace(pg.Scatter(x=[1], y=[1], xaxis='x2', yaxis='y'))

# Dashed divider after generation index 6 (F5): all_results only holds
# '_exp' entries up to that index, later generations are simulation-only.
fig.add_vline(
    x=6.5,
    line_dash="dash",
    line_width=1.5,
    opacity=1,
    line_color="#969696"
)

fig.update_traces(marker=dict(size=7), selector=dict(mode='markers'))


fig.show()
fig.write_image("comparison_dropout.svg")