In [1]:
import numpy as np
from scipy.optimize import curve_fit
import plotly.graph_objects as pg
import plotly.express as px
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

data = analysis.GroupAnalysis([
    ('Genscript_GCfix', analysis.ErrorAnalysis("../data/phix/Genscript_GCfix/Genscript_GCfix_aging")),
    ('Genscript_GCfix', analysis.ErrorAnalysis("../data/phix/Genscript_GCfix/Genscript_GCfix_PCR")),
    ('Genscript_GCall', analysis.ErrorAnalysis("../data/phix/Genscript_GCall/Genscript_GCall_aging", skip=[(84, 86), (12, 16)])),
    ('Genscript_GCall', analysis.ErrorAnalysis("../data/phix/Genscript_GCall/Genscript_GCall_PCR", skip=[(84, 86), (12, 16)])),
    ('Twist_GCall', analysis.ErrorAnalysis("../data/phix/Twist/Twist_GCall_PCR")),
    ('Twist_GCfix', analysis.ErrorAnalysis("../data/phix/Twist/Twist_GCfix_PCR")),
    ('Twist_GCall', analysis.ErrorAnalysis("../data/phix/Twist/Twist_GCall_aging")),
    ('Twist_GCfix', analysis.ErrorAnalysis("../data/phix/Twist/Twist_GCfix_aging")),
])

In [3]:
# Substitution rates by read position; the code below uses its 'read',
# 'position' and 'rate' columns.
plot_data = data.data["substitutions_by_position"]

# For every read, build a smoothed trend line:
#   1. take the median rate at each position (across all rows for that read),
#   2. smooth with a centred 10-position rolling mean (shorter windows at the edges).
# groupby returns positions in ascending order, so no separate sort is needed
# (the previous `plot_data.sort_values(...)` call discarded its result anyway).
fit_data = []
for read in plot_data['read'].unique():
    window = plot_data.loc[plot_data['read'] == read]
    # Keep `position` as the index rather than as a column: the plotting code
    # uses `.index` for the x-axis and the summary cell averages every column,
    # so each frame must hold `rate` as its only column, indexed by position.
    # This no longer depends on positions coinciding with the default row index.
    medians = window.groupby('position')['rate'].median()
    rolled_medians = medians.rolling(10, min_periods=1, center=True).mean().to_frame()
    fit_data.append(rolled_medians)

# One figure holds both the raw data points and the smoothed trend lines.
fig = pg.Figure()

# Raw rates as markers, one colour per read (reads taken in order of first appearance).
colors = ['#2171b5', '#bdd7e7']
for color_idx, read_id in enumerate(plot_data['read'].unique()):
    points = plot_data[plot_data['read'] == read_id]
    raw_trace = pg.Scatter(
        x=points['position'],
        y=points['rate'] * 1000,  # rescaled to the 10^-3 unit named in the y-axis title
        mode="markers",
        marker={'color': colors[color_idx]},
    )
    fig.add_trace(raw_trace)


# Smoothed trend per read, drawn on top as thin dark lines.
colors = ['#222222', '#222222']
for color_idx, smoothed in enumerate(fit_data):
    trend_trace = pg.Scatter(
        x=smoothed.index,
        y=smoothed['rate'] * 1000,
        mode="lines",
        line={'color': colors[color_idx], 'width': 1.5},
    )
    fig.add_trace(trend_trace)

# Text size and axis styling shared by both axes.
label_size = 28/3
axis_style = {
    'title_font_family': "Inter",
    'title_font_size': label_size,
    'tickfont_size': label_size,
    'minor_ticks': "outside",
}

fig.update_layout(
    template="simple_white",
    height=330,
    width=330,
    showlegend=False,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    font_family="Inter",
    legend_font_size=label_size,
)
fig.update_xaxes(
    title_text='Position in read',
    range=[0, 120],
    dtick=20,
    minor_dtick=10,
    **axis_style,
)
fig.update_yaxes(
    title_text='Substitution rate / 10<sup>-3</sup> nt<sup>-1</sup>',
    range=[0, 5],
    dtick=1,
    minor_dtick=0.5,
    **axis_style,
)

# Render inline, then export the same figure as a vector image.
fig.show()
fig.write_image("error_rate.svg")

In [4]:
# Per-read summary: show mean and standard deviation of the smoothed rates,
# then save the rates relative to their mean as rel_rate_1.csv, rel_rate_2.csv, ...
for file_number, smoothed in enumerate(fit_data, start=1):
    read_mean = smoothed.mean()
    read_std = smoothed.std()
    display(read_mean, read_std)
    relative = smoothed / read_mean
    relative.to_csv(f'rel_rate_{file_number}.csv', index=False)

# Pooled statistics over all reads together.
# NOTE(review): NumPy's .std() defaults to ddof=0 while the pandas .std() above
# defaults to ddof=1, so the pooled and per-read spreads use different estimators.
pooled = np.concatenate(fit_data)
display(pooled.mean(), pooled.std())

rate    0.001123
dtype: float64

rate    0.000326
dtype: float64

rate    0.0025
dtype: float64

rate    0.000583
dtype: float64

0.0018117509455185803

0.0008337470008476963

In [5]:
# Rebind plot_data to the substitutions-by-type table; the rest of this cell
# groups it by 'type' and 'read' and summarises its 'ratio' column.
plot_data = data.data["substitutions_by_type"]

def wavg(group):
    """Summarise the ``ratio`` column of one group.

    Despite the name, no weighting is applied: the result is the plain
    arithmetic mean and the pandas default (sample) standard deviation.

    Args:
        group: object exposing a ``ratio`` attribute with ``mean()`` and
            ``std()`` methods (here, a DataFrame slice handed in by
            ``groupby(...).apply``).

    Returns:
        pd.Series with the entries ``'mean'`` and ``'std'``, in that order.
    """
    ratios = group.ratio
    return pd.Series([ratios.mean(), ratios.std()], index=['mean', 'std'])

# Mean and spread of `ratio` for every (substitution type, read) pair.
df_aggregate = plot_data.groupby(['type', 'read'], as_index=False).apply(wavg)

# Grouped bars: one bar per read within each substitution type, std as error bars.
fig = px.bar(
    df_aggregate,
    x="type",
    y="mean",
    color="read",
    error_y="std",
    barmode='group',
    color_discrete_sequence=['#2171b5', '#bdd7e7'],
)
# Restyle the error bars of every trace in one call.
fig.update_traces(error_y={'color': "#222222", 'thickness': 1.5})

# Text size and font settings shared by both axes.
label_size = 28/3
axis_fonts = {
    'title_font_family': "Inter",
    'title_font_size': label_size,
    'tickfont_size': label_size,
}

fig.update_layout(
    template="simple_white",
    height=330,
    width=330,
    showlegend=False,
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0},
    font_family="Inter",
    legend_font_size=label_size,
)
# Fixed category order for the x-axis instead of the order found in the data.
substitution_order = ['A2T', 'G2A', 'C2T', 'G2C', 'T2A', 'A2G', 'T2C', 'C2G', 'G2T', 'T2G', 'C2A', 'A2C']
fig.update_xaxes(
    title_text='Substitution pattern',
    categoryorder='array',
    categoryarray=substitution_order,
    **axis_fonts,
)
fig.update_yaxes(
    title_text='Fraction of substitutions',
    tickformat=",.0%",
    range=[0, 0.5],
    dtick=0.1,
    minor_ticks="outside",
    minor_dtick=0.05,
    **axis_fonts,
)

# Render inline, then export the same figure as a vector image.
fig.show()
fig.write_image("bias.svg")

In [6]:
# Show the aggregated table (mean and std of `ratio` per type and read) behind the bar chart.
df_aggregate

Unnamed: 0,type,read,mean,std
0,A2C,1,0.002901,0.000814
1,A2C,2,0.0027,0.000616
2,A2G,1,0.206548,0.023203
3,A2G,2,0.114736,0.013573
4,A2T,1,0.168426,0.050672
5,A2T,2,0.369133,0.046153
6,C2A,1,0.024604,0.009701
7,C2A,2,0.017441,0.008242
8,C2G,1,0.0139,0.003477
9,C2G,2,0.007984,0.002156
