In [1]:
import numpy as np
from scipy.optimize import curve_fit
import scipy.stats
import plotly.graph_objects as pg
import plotly.express as px
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

data = analysis.GroupAnalysis([
    ('Genscript', analysis.ErrorAnalysis("../data/PCR/15c_Genscript_GCall")),
    ('Genscript', analysis.ErrorAnalysis("../data/PCR/20c_Genscript_GCall")),
    ('Genscript', analysis.ErrorAnalysis("../data/PCR/25c_Genscript_GCall")),
    ('Genscript', analysis.ErrorAnalysis("../data/PCR/10c_Genscript_GCfix")),
    ('Genscript', analysis.ErrorAnalysis("../data/PCR/15c_Genscript_GCfix")),
    ('Genscript', analysis.ErrorAnalysis("../data/PCR/20c_Genscript_GCfix")),
    ('Genscript', analysis.ErrorAnalysis("../data/PCR/25c_Genscript_GCfix")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/0a_Genscript_GCall")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/0b_Genscript_GCall")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/2d_Genscript_GCall")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/4d_Genscript_GCall")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/7d_Genscript_GCall")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/0a_Genscript_GCfix")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/0b_Genscript_GCfix")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/2d_Genscript_GCfix")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/4d_Genscript_GCfix")),
    ('Genscript', analysis.ErrorAnalysis("../data/Aging/7d_Genscript_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/15c_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/30c_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/45c_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/60c_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/75c_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/90c_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/15c_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/30c_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/45c_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/60c_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/75c_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/PCR/90c_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/0a_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/0b_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/2d_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/4d_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/7d_Twist_GCall")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/0a_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/0b_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/2d_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/4d_Twist_GCfix")),
    ('Twist', analysis.ErrorAnalysis("../data/Aging/7d_Twist_GCfix")),
])

In [3]:
def hex_to_rgb(hex_color: str) -> tuple:
    """Convert a hex colour string into an ``(r, g, b)`` tuple of ints (0-255).

    Accepted forms (leading ``#`` and surrounding whitespace are optional):

    * 6 digits, e.g. ``"#6a51a3"``
    * 3-digit shorthand, e.g. ``"#fff"`` (each digit is doubled)
    * 8 or 4 digits with a trailing alpha component, which is ignored

    Args:
        hex_color: The colour string to convert.

    Returns:
        A 3-tuple ``(red, green, blue)`` of ints.

    Raises:
        ValueError: If the string contains non-hex characters or has a length
            that does not match one of the accepted forms.
    """
    digits = hex_color.strip().lstrip("#")

    # Reject anything int(..., 16) would otherwise tolerate (signs, spaces, ...).
    if not digits or any(ch not in "0123456789abcdefABCDEF" for ch in digits):
        raise ValueError(f"invalid hex colour: {hex_color!r}")

    # Expand shorthand (#rgb / #rgba) to the full form by doubling every digit.
    if len(digits) in (3, 4):
        digits = "".join(ch * 2 for ch in digits)

    if len(digits) not in (6, 8):
        raise ValueError(
            f"invalid hex colour: {hex_color!r} (expected 3 or 6 hex digits)"
        )

    return int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16)



# ---------------------------------------------------------------------------
# Figure: deletion rate by position, per group (written to error_rates.svg)
# ---------------------------------------------------------------------------

FONT_FAMILY = "Inter"
FONT_SIZE = 28/3  # used for tick labels, axis titles and annotations


def _rate_by_position(frame, statistic):
    """Aggregate ``rate`` per ``position`` with ``statistic`` ('max', 'min', 'median')."""
    return frame.groupby('position').agg({'rate': statistic}).reset_index()


def _smoothed_per_mille(per_position):
    """Centred 3-row rolling mean of ``rate``, multiplied by 1000 for plotting."""
    return 1000*per_position.rolling(3, min_periods=1, on="position", center=True)['rate'].mean()


def _mean_rate_per_mille(frame, group):
    """Mean over positions of the per-position mean ``rate`` of ``group``, times 1000."""
    return frame.loc[frame['group'] == group].groupby('position')['rate'].mean().mean()*1000


def _add_group_traces(fig, frame, group, color, **axes):
    """Add the min/max band and the per-read median lines of one group to ``fig``.

    Traces are added in a fixed order: smoothed maximum (invisible line),
    smoothed minimum (filled up to the previous trace), then one median line
    per read. ``axes`` is forwarded to every trace, e.g.
    ``xaxis='x2', yaxis='y2'`` to draw on the inset axes; leave it empty for
    the main axes.
    """
    group_data = frame.loc[frame['group'] == group]

    # Upper edge of the band; it must be added first so 'tonexty' below fills up to it.
    upper = _rate_by_position(group_data, 'max')
    fig.add_trace(
        pg.Scatter(
            x=upper['position'],
            y=_smoothed_per_mille(upper),
            line_width=0,
            **axes,
        )
    )

    # Lower edge of the band, filled with the group colour at 25 % opacity.
    lower = _rate_by_position(group_data, 'min')
    red, green, blue = hex_to_rgb(color)
    fig.add_trace(
        pg.Scatter(
            x=lower['position'],
            y=_smoothed_per_mille(lower),
            fill='tonexty',
            line_width=0,
            fillcolor=f"rgba({red}, {green}, {blue}, 0.25)",
            **axes,
        )
    )

    # One (unsmoothed) median line per read; read '2' is drawn dotted.
    for read in frame['read'].unique():
        median = _rate_by_position(group_data.loc[group_data['read'] == read], 'median')
        fig.add_trace(
            pg.Scatter(
                x=median['position'],
                y=1000*median['rate'],
                line_color=color,
                line_width=1.5,
                line_dash='dot' if read == '2' else None,
                **axes,
            )
        )


def _axis(title, axis_range, dtick, minor_dtick, **extra):
    """Axis settings shared by all four axes of the figure."""
    return dict(
        title_text=title,
        range=axis_range,
        dtick=dtick,
        minor_ticks="outside",
        minor_dtick=minor_dtick,
        tickfont_size=FONT_SIZE,
        **extra,
    )


plot_data = data.data["deletions_by_refposition"].copy()

# Keep the original coordinate; 'position' itself is mirrored for read "1" below.
plot_data['position_read'] = plot_data['position']

# Within every experiment, replace the position of read "1" by (max - position).
# NOTE(review): presumably this puts both reads on the common
# "position in synthesis direction" axis used by the plot -- confirm.
for exp in plot_data['exp'].unique():
    read1_rows = (plot_data['exp'] == exp) & (plot_data['read'] == "1")
    read1_positions = plot_data.loc[read1_rows, 'position']
    plot_data.loc[read1_rows, 'position'] = read1_positions.max() - read1_positions


# One colour per group, in order of first appearance of the group in plot_data.
colors = ['#6a51a3', '#9e9ac8']

fig = pg.Figure()
fig.update_layout(
    # Main axes.
    xaxis=_axis('Position in synthesis direction', [0, 120], 20, 10, anchor='y'),
    yaxis=_axis('Deletion rate / 10<sup>-3</sup> nt<sup>-1</sup>', [0, 60], 10, 5, anchor='x'),
    # Inset axes (x2/y2) with a much narrower y range; only the Twist group is drawn on them.
    xaxis2=_axis("", [0, 110], 40, 20, anchor='y2', domain=[0.2, 0.75]),
    yaxis2=_axis("", [0.25, 1.25], 0.5, 0.25, anchor='x2', domain=[0.6, 0.75]),
)

for i, group in enumerate(plot_data['group'].unique()):
    _add_group_traces(fig, plot_data, group, colors[i])
    if group == 'Twist':
        # Repeat the Twist traces on the inset axes.
        _add_group_traces(fig, plot_data, group, colors[i], xaxis='x2', yaxis='y2')


# Grey box and label marking the index region.
fig.add_shape(type="rect",
    x0=7, y0=8, x1=18, y1=35,
    fillcolor="#999999",
    layer='below'
)
fig.add_annotation(
    x=27,
    y=25,
    text="index<br>region",
    align='left',
    showarrow=False,
    font_color='#999999',
    font_family=FONT_FAMILY,
    font_size=FONT_SIZE,
)

# Genscript: dashed horizontal line at the mean rate, labelled just above it.
mean = _mean_rate_per_mille(plot_data, "Genscript")
fig.add_hline(y=mean, line_width=1, line_dash='dash', line_color=colors[0], opacity=1)
fig.add_annotation(
    x=70,
    y=mean+2,
    text=f"mean: {mean:.1f} · 10<sup>-3</sup> nt<sup>-1</sup>",
    showarrow=False,
    font_color=colors[0],
    font_family=FONT_FAMILY,
    font_size=FONT_SIZE,
)

# Twist: label only (no line is drawn for this group).
mean = _mean_rate_per_mille(plot_data, "Twist")
fig.add_annotation(
    x=90,
    y=3,
    text=f"mean: {mean:.1f} · 10<sup>-3</sup> nt<sup>-1</sup>",
    showarrow=False,
    font_color=colors[1],
    font_family=FONT_FAMILY,
    font_size=FONT_SIZE,
)

fig.update_layout(
    template="simple_white",
    height=330,
    width=330,
    showlegend=False,
    margin=dict(l=0, r=0, t=0, b=0),
    font_family=FONT_FAMILY,
    legend_font_size=FONT_SIZE,
)
fig.update_xaxes(
    title_font_family=FONT_FAMILY,
    title_font_size=FONT_SIZE,
)
fig.update_yaxes(
    title_font_family=FONT_FAMILY,
    title_font_size=FONT_SIZE,
)
fig.show()
fig.write_image("error_rates.svg")

In [4]:
# For every group: print the mean deletion rate (mean over positions of the
# per-position mean) and export the per-position rate relative to that mean.
for group_name in plot_data['group'].unique():
    per_position_rate = (
        plot_data.loc[plot_data['group'] == group_name]
        .groupby('position')['rate']
        .mean()
    )
    overall_mean = per_position_rate.mean()
    print(f"{group_name}: {overall_mean}")

    # index=False: the CSV holds only the 'rate' column, one row per position
    # in ascending position order; the position values themselves are not written.
    relative_rate = per_position_rate/overall_mean
    relative_rate.to_frame().to_csv(f"del_rate_{group_name}.csv", index=False)

Genscript: 0.015602686673473266
Twist: 0.0005695676211259752


In [5]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

data = analysis.DistributionAnalysis({
    'Genscript_all': '../data/PCR/15c_Genscript_GCall/scafstats.txt',
    'Twist_all': '../data/PCR/15c_Twist_GCall/scafstats.txt',
    'Genscript_fixed': '../data/PCR/15c_Genscript_GCfix/scafstats.txt',
    'Twist_fixed': '../data/PCR/15c_Twist_GCfix/scafstats.txt'
})

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Missing sequences: 30, 0.24%
Missing sequences: 1, 0.01%
Missing sequences: 0, 0.00%
Missing sequences: 3, 0.03%


Unnamed: 0_level_0,mean,total
exp,Unnamed: 1_level_1,Unnamed: 2_level_1
Genscript_all,269.86594,3365768.0
Genscript_fixed,202.340429,2509426.0
Twist_all,164.276333,1971316.0
Twist_fixed,133.779167,1605350.0


In [6]:
# ---------------------------------------------------------------------------
# Figure: normalized coverage distributions with log-normal fits (coverage.svg)
# ---------------------------------------------------------------------------

plot_data = data.data.copy()

# Experiment names have the form "<group>_<set>"; split them into two columns
# that drive the colouring ('group') and the faceting ('set') below.
for exp_name in plot_data['exp'].unique():
    group_name, set_name = exp_name.split('_')
    exp_rows = plot_data['exp'] == exp_name
    plot_data.loc[exp_rows, 'group'] = group_name
    plot_data.loc[exp_rows, 'set'] = set_name

fig = px.histogram(
    plot_data,
    x="x",
    color="group",
    facet_row="set",
    facet_row_spacing=0.05,
    barmode='overlay',
    histnorm='probability density',
    nbins=1000,
    opacity=0.65,
    range_x=[0, 3],
    range_y=[0, 2],
    color_discrete_sequence=['#6a51a3', '#cbc9e2'],
)
# Explicit bins: 0 to 3 in steps of 0.1.
fig.update_traces(xbins=dict(start=0.0, end=3, size=0.1), selector={'type': 'histogram'})
# Blank the facet labels generated by plotly express.
fig.for_each_annotation(lambda annotation: annotation.update(text=''))

# Overlay a log-normal density and its sigma label for every (set, group) pair.
colors = ['#4a1486', '#6a51a3']
for set_idx, set_name in enumerate(plot_data['set'].unique()):
    # NOTE(review): assumes exactly two facet rows; the first set is drawn in
    # row 2 and the second in row 1 -- confirm this matches px's facet order.
    facet_row = 2 - set_idx

    for group_idx, group_name in enumerate(plot_data['group'].unique()):
        exp_name = f"{group_name}_{set_name}"
        coverage = plot_data.loc[plot_data['exp'] == exp_name, 'x']

        # Missing values and exact zeros are excluded before taking the logarithm.
        nonzero = coverage.dropna()
        nonzero = nonzero[nonzero != 0.0]
        log_coverage = np.log(nonzero)
        mean, std = log_coverage.mean(), log_coverage.std()

        x = np.linspace(0.01, coverage.max(), 400)
        fit_line = pg.Scatter(
            x=x,
            y=scipy.stats.lognorm.pdf(x, scale=np.exp(mean), s=std),
            mode="lines",
            showlegend=False,
            line=dict(color=colors[group_idx], width=2),
        )
        fig.add_trace(fit_line, row=facet_row, col=1)

        # Hand-placed label position: first group to the right, second group higher up.
        if group_idx == 0:
            label_x, label_y = 2.5, 0.35
        else:
            label_x, label_y = 1.6, 1
        fig.add_annotation(
            x=label_x,
            y=label_y,
            text=f"σ = {std:.2f}",
            showarrow=False,
            yshift=0,
            row=facet_row,
            col=1,
            font=dict(color=colors[group_idx], family="Inter", size=28/3),
        )

fig.for_each_yaxis(lambda yaxis: yaxis.update(dtick=1))
# Remove the bar outlines of the histograms.
fig.update_traces(marker=dict(line_width=0), selector={'type': 'histogram'})

fig.update_layout(
    template="simple_white",
    height=330,
    width=330,
    showlegend=False,
    margin=dict(l=0, r=0, t=0, b=0),
    font_family="Inter",
    legend_font_size=28/3,
)

# Tick and title styling shared by the x and y axes of both facets.
axis_style = dict(
    title_font_family="Inter",
    title_font_size=28/3,
    dtick=1,
    minor_ticks="outside",
    minor_dtick=0.5,
    tickfont_size=28/3,
)
fig.update_xaxes(title_text='', **axis_style)
# Only the x axis in row 1 carries a title.
fig.update_xaxes(title_text='Normalized coverage', row=1, col=1)
fig.update_yaxes(title_text='Probability density', **axis_style)

fig.show()
fig.write_image("coverage.svg")