In [1]:
import numpy as np
import plotly.graph_objects as pg
import plotly.express as px
import pandas as pd
import statsmodels.formula.api as smf

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

# Assemble the PCR series for both Twist pools. Each entry is a
# (group label, series variable, ErrorAnalysis) tuple: folders PCR0 ... PCR5
# are paired with the series values 15, 30, ..., 90.
# NOTE(review): the series variable is presumably the cumulative number of
# PCR cycles -- confirm against the experiment records.
DATA_ROOT = "../../../data/internal_validation"

data = analysis.SeriesAnalysis([
    (pool, 15 * (step + 1), analysis.ErrorAnalysis(f"{DATA_ROOT}/Twist_{pool}_PCR/PCR{step}"))
    for pool in ('GCall', 'GCfix')
    for step in range(6)
])

In [3]:
# Colour of every substitution category. The insertion order matters:
# `colors` is derived from it and is consumed positionally by the plots.
cat_color_map = {
    "C2G/G2C": '#cccccc',
    "A2C/T2G": '#969696',
    "C2A/G2T": '#525252',
    "A2T/T2A": '#fdbe85',
    "C2T/G2A": '#fd8d3c',
    "A2G/T2C": '#d94701',
}
colors = list(cat_color_map.values())

# Category labels in alphabetical order. Every label joins a substitution
# with its complementary-strand counterpart (e.g. A2C with T2G).
categories = sorted(cat_color_map)

# Map each single substitution type to the category label containing it:
# first the left-hand members of all labels, then the right-hand members.
error_categories = {
    label.split("/")[half]: label
    for half in (0, 1)
    for label in categories
}

# Per-type substitution rates of the whole series; copied so the frame held
# by the analysis object is not modified by the columns added below.
df = data.data["substitutions_by_type"].copy()

# find intercept to normalize initial rates for each experiment group
# Each (group, type, read) subset gets its own linear fit of rate against
# delta_series_var; the fitted intercept is subtracted so that every subset
# starts from a common zero baseline.
df['delta_rate'] = df['rate']
# Only combinations that actually occur in the data are visited. Looping over
# the Cartesian product of the unique values would hand an empty frame to the
# regression whenever one combination is absent.
observed_keys = df[['group', 'type', 'read']].drop_duplicates()
for group, errortype, read in observed_keys.itertuples(index=False, name=None):
    # Boolean mask (rather than index labels) so the selection stays correct
    # even if the frame's index is not unique.
    subset = (df['group'] == group) & (df['type'] == errortype) & (df['read'] == read)
    baseline = smf.ols(formula='rate ~ delta_series_var', data=df.loc[subset]).fit()
    df.loc[subset, 'delta_rate'] -= baseline.params['Intercept']

# combine analogous types into categories
# Types without an entry in error_categories keep their own name.
df['category'] = df['type'].map(lambda name: error_categories.get(name, name))

# find fits for the categories based on all data
# One linear model per category, pooled over every row of that category.
fits = {
    name: smf.ols(
        formula='delta_rate ~ delta_series_var',
        data=df.loc[df['category'] == name],
    ).fit()
    for name in df['category'].unique()
}
# Slope of each fit, i.e. the change of delta_rate per unit of delta_series_var.
ratios = {name: model.params['delta_series_var'] for name, model in fits.items()}

# Tabulate the slopes together with each category's share of the summed slope.
ratio_df = pd.DataFrame.from_dict(ratios, orient="index", columns=["rate"])
total_rate = ratio_df['rate'].sum()
ratio_df['ratio'] = ratio_df['rate'] / total_rate
ratio_df['ratio/2'] = ratio_df['ratio'] / 2
display(ratio_df)


# Scatter of the intercept-corrected rates, one colour per category.
# The y values are scaled by 1000 to match the 10^-3 unit of the y-axis title
# set further down in this cell.
# NOTE(review): the factor 2 presumably converts the pooled per-type rate of a
# two-member category into the category total -- confirm.
# Colours are bound by category name through cat_color_map (the same mapping
# the fit lines use) instead of by position, so markers and lines cannot drift
# apart if a category is missing from the data.
fig = px.scatter(
    df,
    x="delta_series_var",
    y=2*df["delta_rate"]*1000,
    color="category",
    range_x=[-1, 76],
    range_y=[-0.1, 8.2],
    color_discrete_map=cat_color_map,
    category_orders={"category": ["C2G/G2C", "A2C/T2G", "C2A/G2T", "A2T/T2A", "C2T/G2A", "A2G/T2C"]}
)


# Ten evenly spaced x positions spanning the observed delta_series_var range;
# reused for every fitted line drawn on the figure.
x_var = np.linspace(min(df['delta_series_var']), max(df['delta_series_var']), 10)

# Overlay one fitted line per category, scaled like the scatter points
# (factor 2 and factor 1000) and coloured through cat_color_map.
fig.add_traces([
    pg.Scatter(
        x=x_var,
        y=2*fits[label].predict(exog={'delta_series_var': x_var})*1000,
        mode="lines",
        showlegend=False,
        line=dict(color=cat_color_map[label], width=1.5),
    )
    for label in categories
])




# Overall substitution rates only, copied so the analysis object's frame is
# left untouched by the column added below.
overall_rates = data.data["overall_error_rates"]
alldf = overall_rates.loc[overall_rates['type'] == "substitutions"].copy()

# Subtract the fitted intercept of every (group, read) subset so that all
# subsets share a common zero baseline.
alldf['delta_rate'] = alldf['rate']
# Visit only the combinations present in the data; the Cartesian product of
# the unique values would pass an empty frame to the regression whenever a
# combination is absent.
observed_keys = alldf[['group', 'read']].drop_duplicates()
for group, read in observed_keys.itertuples(index=False, name=None):
    # Boolean mask keeps the selection correct even with a non-unique index.
    subset = (alldf['group'] == group) & (alldf['read'] == read)
    baseline = smf.ols(formula='rate ~ delta_series_var', data=alldf.loc[subset]).fit()
    alldf.loc[subset, 'delta_rate'] -= baseline.params['Intercept']

# Single pooled fit over all baseline-corrected subsets; its slope is the
# value reported in the figure annotation.
fit = smf.ols(formula='delta_rate ~ delta_series_var', data=alldf).fit()

# Baseline-corrected overall substitution rates as dark markers (scaled by
# 1000 like the rest of the figure). The colour is set on `marker`, the
# property that styles a markers-only trace, rather than on `line`.
fig.add_trace(
    pg.Scatter(
        x=alldf['delta_series_var'],
        y=alldf['delta_rate']*1000,
        mode="markers",
        showlegend=False,
        marker=dict(color="#222222")
    )
)
# Pooled linear fit of the overall rate, drawn thicker than the per-category
# fit lines.
fig.add_trace(
    pg.Scatter(
        x=x_var,
        y=fit.predict(exog={'delta_series_var': x_var})*1000,
        mode="lines",
        showlegend=False,
        line=dict(color="#222222", width=2.5)
    )
)

# Font shared by both text labels that belong to the overall ("total") fit.
total_label_font = dict(color="#222222", family="Inter", size=28/3)

# Slope of the overall fit, rescaled by 10^4 to match the unit in the label.
slope_e4 = fit.params['delta_series_var']*10000
fig.add_annotation(
    x=30,
    y=7,
    text=f"{slope_e4:.2f} · 10<sup>-4</sup> nt<sup>-1</sup> cy<sup>-1</sup>",
    showarrow=False,
    yshift=0,
    font=total_label_font,
)

# Left-anchored "total" label placed near the top edge of the y range.
fig.add_annotation(
    x=62,
    y=8.05,
    text="total",
    showarrow=False,
    yshift=0,
    xanchor="left",
    font=total_label_font,
)

# Text size used throughout the figure.
# NOTE(review): 28/3 px is presumably 7 pt expressed in pixels -- confirm.
FONT_SIZE = 28/3

# Styling shared by both axes; only title, tick spacing and minor tick
# spacing differ between x and y.
axis_style = dict(
    title_font_family="Inter",
    title_font_size=FONT_SIZE,
    tickfont_size=FONT_SIZE,
    minor_ticks="outside",
)

fig.update_layout(
    template="simple_white",
    width=330,
    height=240,
    margin=dict(l=0, r=8, t=5, b=0),
    showlegend=False,
    legend_font_size=FONT_SIZE,
    font_family="Inter",
)
fig.update_xaxes(title_text='Number of PCR cycles', dtick=15, minor_dtick=5, **axis_style)
fig.update_yaxes(
    title_text='Excess substitution rate / 10<sup>-3</sup> nt<sup>-1</sup>',
    dtick=2,
    minor_dtick=1,
    **axis_style,
)

# Render inline, then export the same figure as SVG next to the notebook.
fig.show()
fig.write_image("sub_rate.svg")

Unnamed: 0,rate,ratio,ratio/2
A2C/T2G,1.566196e-06,0.030315,0.015158
A2G/T2C,3.130396e-05,0.605919,0.302959
A2T/T2A,6.425323e-06,0.124368,0.062184
C2A/G2T,1.557571e-06,0.030148,0.015074
C2G/G2C,7.549602e-07,0.014613,0.007306
C2T/G2A,1.005561e-05,0.194636,0.097318


In [4]:
# Category shares sorted from largest to smallest. The constant `group`
# column puts every category on the same y position of the bar chart.
factor_df = ratio_df.assign(group=1).sort_values("ratio", ascending=False)

# Horizontal bar of the shares, coloured per category through cat_color_map.
fig = px.bar(
    factor_df,
    x='ratio',
    y='group',
    orientation='h',
    color=factor_df.index,
    color_discrete_map=cat_color_map,
    range_x=[0, 1],
)

# Compact strip layout: percentage axis on top, y axis hidden.
(
    fig
    .update_layout(
        template="simple_white",
        width=330,
        height=85,
        margin=dict(l=33, r=0, t=30, b=20),
        showlegend=False,
        coloraxis_showscale=False,
    )
    .update_xaxes(
        title=dict(
            text='Fraction of substitutions',
            font=dict(family="Inter", size=28/3),
        ),
        side='top',
        tickformat=",.0%",
        dtick=0.25,
        tickfont=dict(size=28/3),
        minor=dict(ticks="outside", dtick=0.125),
    )
    .update_yaxes(visible=False)
)

# Export as SVG, render inline, and show the underlying table as cell output.
fig.write_image("base_bias.svg")
fig.show()
factor_df

Unnamed: 0,rate,ratio,ratio/2,group
A2G/T2C,3.130396e-05,0.605919,0.302959,1
C2T/G2A,1.005561e-05,0.194636,0.097318,1
A2T/T2A,6.425323e-06,0.124368,0.062184,1
A2C/T2G,1.566196e-06,0.030315,0.015158,1
C2A/G2T,1.557571e-06,0.030148,0.015074,1
C2G/G2C,7.549602e-07,0.014613,0.007306,1
