In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import plotly.express as px
import plotly.graph_objects as pg

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

# Every run directory follows the "../data/<experiment>_<group>_GCall" layout,
# so the (group, ErrorAnalysis) pairs are generated rather than listed by hand.
# The resulting order is unchanged: all Genscript runs first, then all Twist
# runs, each with its PCR runs ahead of its Aging runs.
data = analysis.GroupAnalysis([
    (group_label, analysis.ErrorAnalysis(f"../data/{run}_{group_label}_GCall"))
    for group_label, runs in (
        ('Genscript', (
            "PCR/15c", "PCR/20c", "PCR/25c",
            "Aging/0a", "Aging/0b", "Aging/2d", "Aging/4d", "Aging/7d",
        )),
        ('Twist', (
            "PCR/15c", "PCR/30c", "PCR/45c", "PCR/60c", "PCR/75c", "PCR/90c",
            "Aging/0a", "Aging/0b", "Aging/2d", "Aging/4d", "Aging/7d",
        )),
    )
    for run in runs
])

# Consecutive errors

In [3]:
from plotly.subplots import make_subplots



# Bar fill colours, indexed by the position of each group in the plotting loops.
colors = ["#636363", "#bdbdbd"]
# Fill and outline colours of the overlaid scatter markers, indexed the same way.
colors_marker = ["#222222", "#222222"]
# Run lengths at or above this value are pooled into one final "<MAX_LENGTH>+" bin.
MAX_LENGTH = 4


def length_plots(data, errortype):
    """Plot how often errors of one type occur in runs of a given length.

    For every group the per-length frequencies are averaged and drawn as
    grouped bars, with the sample standard deviation as error bar. Lengths of
    ``MAX_LENGTH`` or more are pooled into a single final bin. Diamond markers
    show the frequencies given by a geometric distribution with success
    probability ``1 - rate``, where ``rate`` is the group's mean overall error
    rate, i.e. ``P(length = k) = rate**(k-1) * (1 - rate)``.

    The same traces are drawn in both rows of a two-row figure with a shared
    x axis; the rows differ only in their y range, so the upper row shows the
    top of the tall bars and the lower row the small frequencies.

    Relies on the module-level names ``make_subplots``, ``pg``, ``colors``,
    ``colors_marker`` (one colour per group) and ``MAX_LENGTH``.

    Parameters
    ----------
    data
        Object whose ``data`` mapping holds the frames
        ``"error_frequency_by_length"`` (columns used: ``type``, ``exp``,
        ``read``, ``group``, ``length``, ``value``) and
        ``"overall_error_rates"`` (columns used: ``type``, ``group``, ``rate``).
    errortype
        Value of the ``type`` column selecting which rows are plotted.

    Returns
    -------
    The plotly figure created by ``make_subplots``.
    """
    fig = make_subplots(
        rows=2,
        shared_xaxes=True,
        vertical_spacing=0.05,
        row_heights=[0.3, 0.7],
    )

    by_length = data.data["error_frequency_by_length"]
    # Boolean masks select by row position. A label-based drop() would remove
    # every row sharing an index label, which over-deletes on a non-unique index.
    idata = by_length.loc[by_length['type'] == errortype].copy()
    # Assign the clipped column back explicitly: an in-place clip() on the
    # attribute-accessed Series does not reach the frame under Copy-on-Write.
    idata['length'] = idata['length'].clip(upper=MAX_LENGTH)

    # Pool all rows that now sit in the last bin into one row per experiment/read.
    in_tail = idata['length'] == MAX_LENGTH
    newframe = idata.loc[in_tail].groupby(['type', 'exp', 'read', 'group'])['value'].sum().reset_index()
    newframe['length'] = MAX_LENGTH
    idata = pd.concat([idata.loc[~in_tail], newframe], ignore_index=True)

    df_aggregate = idata.groupby(['group', 'length'], as_index=False).agg(
        mean=('value', 'mean'),
        std=('value', 'std'),
    )

    length = np.arange(1, df_aggregate['length'].max()+1)
    rates = data.data["overall_error_rates"]
    mean_rates = rates.loc[rates['type'] == errortype].groupby('group').agg(
        mean=('rate', 'mean'),
        std=('rate', 'std'),
    )

    for i, group in enumerate(df_aggregate['group'].unique()):

        this_data = df_aggregate.loc[df_aggregate['group'] == group]

        # Observed frequencies, drawn once per row.
        for row in (1, 2):
            fig.add_trace(
                pg.Bar(
                    x=this_data['length'],
                    y=this_data['mean'],
                    error_y=dict(
                        type='data',
                        array=this_data['std'],
                        color='#222222',
                        visible=True,
                        thickness=1.5,
                    ),
                    marker_color=colors[i]
                ),
                row=row,
                col=1,
            )

        ratios = scipy.stats.geom.pmf(length, 1-mean_rates.loc[group, 'mean'])
        # The last bin stands for "this length or longer", so it receives the
        # whole remaining probability mass instead of the single pmf value.
        ratios[-1] = 1 - np.sum(ratios[:-1])

        # Expected frequencies; the x offset places each marker on the bar of
        # its group within the grouped-bar layout.
        for row in (1, 2):
            fig.add_trace(
                pg.Scatter(
                    x=length-0.4/2+i*0.4,
                    y=ratios,
                    mode="markers",
                    marker_color=colors_marker[i],
                    marker_line_color=colors_marker[i],
                    marker_line_width=2,
                    marker_size=8,
                    marker_symbol="diamond-wide",
                ),
                row=row,
                col=1,
            )

    fig.update_layout(
        template='simple_white',
        height=175,
        width=245,
        showlegend=False,
        barmode='group',
        margin=dict(l=50, r=0, t=5, b=0),
        font_family="Inter",
        legend_font_size=28/3,
    )

    # Tick labels "1", "2", ..., with a trailing "+" on the pooled last bin.
    ticktext = list(map(str, range(1, MAX_LENGTH+1)))
    ticktext[-1] += '+'

    # Hide every x axis first, then re-enable only the bottom one.
    fig.update_xaxes(
        showticklabels=False,
        visible=False,
        range=[0.5, MAX_LENGTH+0.5],
        tickmode = 'array',
        tickvals = list(range(1, MAX_LENGTH+1)),
        ticktext = ticktext,
    )
    fig.update_xaxes(showticklabels=True, visible=True, row=2, col=1)

    fig.update_yaxes(tickformat=",.0%")
    fig.update_yaxes(range=[0, 0.1], row=2, dtick=0.05)
    fig.update_yaxes(range=[0.725, 1.03], row=1, dtick=0.2)

    return fig

In [4]:
# Run-length distribution of consecutive deletions.
fig = length_plots(data, 'deletions')

# Lower row: small frequencies, carries the axis title.
# Keyword order is kept: `title` has to come before `title_font_size`.
fig.update_yaxes(
    title='Frequency',
    range=[0, 0.105],
    row=2,
    dtick=0.03,
    title_font_size=28/3,
    tickfont_size=28/3,
)
# Upper row: top of the tall bars.
fig.update_yaxes(
    range=[0.725, 1.03],
    row=1,
    dtick=0.2,
    title_font_size=28/3,
    tickfont_size=28/3,
)
fig.update_xaxes(
    title='Length of consecutive deletions',
    title_font_size=28/3,
    tickfont_size=28/3,
)

fig.show()
fig.write_image("length_dels.svg")

In [5]:
idata = data.data["error_frequency_by_length"].copy()

for group in idata['group'].unique():
    # Boolean-mask selection works by row position; a label-based drop() would
    # remove every row sharing an index label if the index is not unique.
    deletions = idata.loc[(idata['type'] == 'deletions') & (idata['group'] == group)]
    lengthmean = deletions.groupby('length').agg({'value': 'mean'}).reset_index()
    display(group, lengthmean)
    # Only the value column is exported; rows are in ascending length order.
    lengthmean['value'].to_csv(f"del_by_length_{group}.csv", index=False)

    # Mean length of deletion runs longer than one: drop the length-1 row,
    # renormalise the remaining frequencies and use them as weights. This is
    # done on a separate selection so the frame displayed above stays as shown.
    multi = lengthmean.loc[lengthmean['length'] != 1]
    weights = multi['value'] / multi['value'].sum()
    display(np.sum(weights * multi['length']))

'Genscript'

Unnamed: 0,length,value
0,1,0.904807
1,2,0.073675
2,3,0.01174
3,4,0.003537
4,5,0.001672
5,6,0.001108
6,7,0.000857
7,8,0.000693
8,9,0.000548
9,10,0.000409


2.5637905128398475

'Twist'

Unnamed: 0,length,value
0,1,0.860229
1,2,0.061242
2,3,0.017815
3,4,0.011053
4,5,0.008265
5,6,0.007185
6,7,0.006237
7,8,0.005374
8,9,0.004803
9,10,0.004144


4.660044415723572

In [6]:
# Run-length distribution of consecutive substitutions.
fig = length_plots(data, 'substitutions')

# Lower row: small frequencies, carries the axis title.
# Keyword order is kept: `title` has to come before `title_font_size`.
fig.update_yaxes(
    title='Frequency',
    range=[0, 0.014],
    row=2,
    dtick=0.01,
    tickformat=",.0%",
    title_font_size=28/3,
    tickfont_size=28/3,
)
# Upper row: top of the tall bars.
fig.update_yaxes(
    range=[0.93, 1],
    row=1,
    dtick=0.05,
    title_font_size=28/3,
    tickfont_size=28/3,
)
fig.update_xaxes(
    title='Length of consecutive substitutions',
    title_font_size=28/3,
    tickfont_size=28/3,
)
# Slightly larger top margin than the default set inside length_plots.
fig.update_layout(
    margin=dict(l=50, r=0, t=8, b=0),
)

fig.show()
fig.write_image("length_subs.svg")

# Errors per read

In [7]:
# Number of trials (n) handed to the binomial model for each group in per_read;
# presumably the number of bases per sequence in each pool -- TODO confirm.
n_bases = {
    'Genscript': 102,
    'Twist': 108,
}
# Per-read error counts at or above this value are pooled into one final "<MAX_ERRORS>+" bin.
MAX_ERRORS = 3

def per_read(data, errortype, n_bases):
    """Plot the distribution of the number of errors of one type per read.

    For every group the per-count frequencies are averaged and drawn as
    grouped bars, with the sample standard deviation as error bar. Counts of
    ``MAX_ERRORS`` or more are pooled into a single final bin. Diamond markers
    show the frequencies given by a binomial distribution with
    ``n = n_bases[group]`` trials and the group's mean overall error rate as
    success probability.

    Relies on the module-level names ``pg``, ``colors``, ``colors_marker``
    (one colour per group) and ``MAX_ERRORS``.

    Parameters
    ----------
    data
        Object whose ``data`` mapping holds the frames
        ``"error_frequency_by_read"`` (columns used: ``type``, ``exp``,
        ``read``, ``group``, ``frequency``, ``value``) and
        ``"overall_error_rates"`` (columns used: ``type``, ``group``, ``rate``).
    errortype
        Value of the ``type`` column selecting which rows are plotted.
    n_bases
        Mapping from group name to the number of binomial trials for that group.

    Returns
    -------
    The plotly figure holding one bar trace and one marker trace per group.
    """
    by_read = data.data["error_frequency_by_read"]
    # Boolean masks select by row position. A label-based drop() would remove
    # every row sharing an index label, which over-deletes on a non-unique index.
    idata = by_read.loc[by_read['type'] == errortype].copy()
    # Assign the clipped column back explicitly: an in-place clip() on the
    # attribute-accessed Series does not reach the frame under Copy-on-Write.
    idata['frequency'] = idata['frequency'].clip(upper=MAX_ERRORS)

    # Pool all rows that now sit in the last bin into one row per experiment/read.
    in_tail = idata['frequency'] == MAX_ERRORS
    newframe = idata.loc[in_tail].groupby(['type', 'exp', 'read', 'group'])['value'].sum().reset_index()
    newframe['frequency'] = MAX_ERRORS
    idata = pd.concat([idata.loc[~in_tail], newframe], ignore_index=True)

    df_aggregate = idata.groupby(['group', 'frequency'], as_index=False).agg(
        mean=('value', 'mean'),
        std=('value', 'std'),
    )

    frequency = np.arange(0, df_aggregate['frequency'].max()+1)
    rates = data.data["overall_error_rates"]
    mean_rates = rates.loc[rates['type'] == errortype].groupby('group').agg(
        mean=('rate', 'mean'),
        std=('rate', 'std'),
    )

    fig = pg.Figure()

    for i, group in enumerate(df_aggregate['group'].unique()):

        this_data = df_aggregate.loc[df_aggregate['group'] == group]

        # Observed frequencies.
        fig.add_trace(
            pg.Bar(
                x=this_data['frequency'],
                y=this_data['mean'],
                error_y=dict(
                    type='data',
                    array=this_data['std'],
                    color='#222222',
                    visible=True,
                    thickness=1.5,
                ),
                marker_color=colors[i]
            )
        )

        ratios = scipy.stats.binom.pmf(frequency, n_bases[group], mean_rates.loc[group, 'mean'])
        # The last bin stands for "this many or more", so it receives the
        # whole remaining probability mass instead of the single pmf value.
        ratios[-1] = 1 - np.sum(ratios[:-1])

        # Expected frequencies, added once per group; the x offset places each
        # marker on the bar of its group within the grouped-bar layout.
        fig.add_trace(
            pg.Scatter(
                x=frequency-0.4/2+i*0.4,
                y=ratios,
                mode="markers",
                marker_color=colors_marker[i],
                marker_line_color=colors_marker[i],
                marker_line_width=2,
                marker_size=8,
                marker_symbol="diamond-wide",
            ),
        )

    fig.update_layout(
        template='simple_white',
        height=175,
        width=245,
        showlegend=False,
        barmode='group',
        margin=dict(l=50, r=0, t=5, b=0),
        font_family="Inter",
        legend_font_size=28/3,
    )

    # Tick labels "0", "1", ..., with a trailing "+" on the pooled last bin.
    ticktext = list(map(str, range(0, MAX_ERRORS+1)))
    ticktext[-1] += '+'

    fig.update_xaxes(
        range=[-0.5, MAX_ERRORS+0.5],
        tickmode = 'array',
        tickvals = list(range(0, MAX_ERRORS+1)),
        ticktext = ticktext,
    )

    # `title` must stay ahead of the `title_font_*` keys in this call.
    fig.update_yaxes(
        title="Frequency",
        tickformat=",.0%",
        range=[0, 0.5],
        dtick=0.25,
        title_font_family="Inter",
        title_font_size=28/3,
        tickfont_size=28/3
    )

    return fig

In [8]:
# Distribution of the number of deletion events per read.
fig = per_read(data, 'delevents', n_bases)

# Widen the y range beyond the default set inside per_read.
fig.update_yaxes(range=[0, 0.6], dtick=0.2)
# Keyword order is kept: `title` has to come before the `title_font_*` keys.
fig.update_xaxes(
    title='Deletion events per read',
    title_font_family="Inter",
    title_font_size=28/3,
    tickfont_size=28/3,
)

fig.show()
fig.write_image("read_dels.svg")

In [9]:
idata = data.data["error_frequency_by_read"].copy()

for group in idata['group'].unique():
    # Boolean-mask selection works by row position; a label-based drop() would
    # remove every row sharing an index label if the index is not unique.
    delevents = idata.loc[(idata['type'] == 'delevents') & (idata['group'] == group)]
    mean_data = pd.DataFrame(delevents.groupby('frequency')['value'].mean())
    display(group, mean_data)
    # index=False: only the value column is exported; rows are in ascending
    # frequency order.
    mean_data.to_csv(f"del_per_read_{group}.csv", index=False)

'Genscript'

Unnamed: 0_level_0,value
frequency,Unnamed: 1_level_1
0,0.519202
1,0.244987
2,0.107337
3,0.052245
4,0.028378
5,0.016943
6,0.010816
7,0.007342
8,0.005052
9,0.003438


'Twist'

Unnamed: 0_level_0,value
frequency,Unnamed: 1_level_1
0,0.9604
1,0.038426
2,0.001021
3,9.4e-05
4,3.6e-05
5,1.5e-05
6,5e-06
7,3e-06
8,1e-06
9,2e-06


In [10]:
# Distribution of the number of substitution errors per read.
fig = per_read(data, 'substitutions', n_bases)

# Widen the y range beyond the default set inside per_read.
fig.update_yaxes(range=[0, 0.6], dtick=0.2)
# Keyword order is kept: `title` has to come before the `title_font_*` keys.
fig.update_xaxes(
    title='Substitution errors per read',
    title_font_family="Inter",
    title_font_size=28/3,
    tickfont_size=28/3,
)

fig.show()
fig.write_image("read_subs.svg")