In [1]:
import numpy as np
from scipy.optimize import curve_fit
import plotly.graph_objects as pg
import plotly.express as px
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import statsmodels.api as sm

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

# One entry per sequenced sample: (group label, series value, sample folder).
# The series value is presumably the storage duration that the later cells
# plot on the x axis -- TODO confirm against dt4dds.analysis.SeriesAnalysis.
# Folders are resolved relative to ../data/Aging/.
aging_samples = [
    ('Genscript_GCall', 0,    "0b_Genscript_GCall"),
    ('Genscript_GCall', 2.10, "2d_Genscript_GCall"),
    ('Genscript_GCall', 4.24, "4d_Genscript_GCall"),
    ('Genscript_GCall', 7.09, "7d_Genscript_GCall"),
    ('Genscript_GCfix', 0,    "0b_Genscript_GCfix"),
    ('Genscript_GCfix', 2.22, "2d_Genscript_GCfix"),
    ('Genscript_GCfix', 5.17, "4d_Genscript_GCfix"),
    ('Genscript_GCfix', 7.18, "7d_Genscript_GCfix"),
    ('Twist_GCall',     0,    "0b_Twist_GCall"),
    ('Twist_GCall',     2.02, "2d_Twist_GCall"),
    ('Twist_GCall',     4.04, "4d_Twist_GCall"),
    ('Twist_GCall',     7.03, "7d_Twist_GCall"),
    ('Twist_GCfix',     0,    "0b_Twist_GCfix"),
    ('Twist_GCfix',     2.44, "2d_Twist_GCfix"),
    ('Twist_GCfix',     4.89, "4d_Twist_GCfix"),
    ('Twist_GCfix',     8.50, "7d_Twist_GCfix"),
]

# Build the series in table order; each sample gets its own ErrorAnalysis.
data = analysis.SeriesAnalysis([
    (group_label, series_value, analysis.ErrorAnalysis(f"../data/Aging/{folder}"))
    for group_label, series_value, folder in aging_samples
])

In [3]:
# Figure: excess error rate vs. storage duration for the error types that
# remain after excluding the three listed below (per the y-axis title these
# are the substitutions -- TODO confirm no further types exist in the table).
#
# Rows are filtered with a boolean mask rather than drop-by-index, so rows are
# selected by position and a non-unique index cannot remove unrelated rows.
excluded_types = ["delevents", "deletions", "insertions"]
all_rates = data.data["overall_error_rates"]
df = all_rates.loc[~all_rates['type'].isin(excluded_types)].copy()

colors = ['#238b45', '#74c476', '#bae4b3']
font_size = 28/3

# Excess rate = rate minus the mean rate of the samples with
# delta_series_var == 0 in the same (group, type, read) stratum.
# Strata without such a sample get NaN, as the mean of an empty selection.
strata = ['group', 'type', 'read']
baseline = (
    df.assign(baseline_rate=df['rate'].where(df['delta_series_var'] == 0))
    .groupby(strata)['baseline_rate']
    .transform('mean')
)
df['delta_rate'] = df['rate'] - baseline

# 'group' has the form "<prov>_<GC>", e.g. "Twist_GCfix".
df[['prov', 'GC']] = df['group'].str.split("_", expand=True)

fig = px.scatter(
    df,
    x=df['delta_series_var'],
    y=df['delta_rate']*1000,  # plotted in units of 10^-3 nt^-1
    color='prov',
    symbol='GC',
    trendline='ols',
    trendline_scope="overall",
    trendline_options={"add_constant": False},  # fit through the origin
    trendline_color_override="#444444",
    color_discrete_sequence=colors,
)

# The fit has no constant, so its only parameter is the slope. y is in units
# of 10^-3, hence the factor 10 to express the slope in units of 10^-4.
slope = px.get_trendline_results(fig).px_fit_results.iloc[0].params[0]
fig.add_annotation(
    x=2, y=0.9,
    text=f"{10*slope:.2f} · 10<sup>-4</sup> nt<sup>-1</sup> τ<sup>-1</sup>",
    showarrow=False,
    font={'family': "Inter", 'size': font_size}
)

fig.update_layout(
    template="simple_white",
    height=220,
    width=330,
    showlegend=False,
    margin=dict(l=0, r=8, t=5, b=0),
    font_family="Inter",
    legend_font_size=font_size,
)
fig.update_xaxes(
    title_text='Storage duration / τ',
    title_font_family="Inter",
    title_font_size=font_size,
    range=[-0.1, 9],
    dtick=2,
    minor_ticks="outside",
    minor_dtick=1,
    tickfont_size=font_size,
)
fig.update_yaxes(
    title_text='Excess substitution rate / 10<sup>-3</sup> nt<sup>-1</sup>',
    title_font_family="Inter",
    title_font_size=font_size,
    range=[-0.02, 1.5],
    dtick=0.5,
    minor_ticks="outside",
    minor_dtick=0.25,
    tickfont_size=font_size,
)

fig.show()
fig.write_image("error_rates.svg")

In [4]:
# Figure: excess error rate vs. storage duration for the error types that
# remain after excluding the three listed below (per the y-axis title these
# are the deletions -- TODO confirm no further types exist in the table).
#
# Rows are filtered with a boolean mask rather than drop-by-index, so rows are
# selected by position and a non-unique index cannot remove unrelated rows.
excluded_types = ["delevents", "substitutions", "insertions"]
all_rates = data.data["overall_error_rates"]
df = all_rates.loc[~all_rates['type'].isin(excluded_types)].copy()

colors = ['#238b45', '#74c476', '#bae4b3']
font_size = 28/3

# Excess rate = rate minus the mean rate of the samples with
# delta_series_var == 0 in the same (group, type, read) stratum.
# Strata without such a sample get NaN, as the mean of an empty selection.
strata = ['group', 'type', 'read']
baseline = (
    df.assign(baseline_rate=df['rate'].where(df['delta_series_var'] == 0))
    .groupby(strata)['baseline_rate']
    .transform('mean')
)
df['delta_rate'] = df['rate'] - baseline

# 'group' has the form "<prov>_<GC>", e.g. "Twist_GCfix".
df[['prov', 'GC']] = df['group'].str.split("_", expand=True)

fig = px.scatter(
    df,
    x=df['delta_series_var'],
    y=df['delta_rate']*1000,  # plotted in units of 10^-3 nt^-1
    color='prov',
    symbol='GC',
    trendline='ols',
    trendline_scope="overall",
    trendline_options={"add_constant": False},  # fit through the origin
    trendline_color_override="#444444",
    color_discrete_sequence=colors,
)

# The fit has no constant, so its only parameter is the slope. y is in units
# of 10^-3, hence the factor 10 to express the slope in units of 10^-4.
slope = px.get_trendline_results(fig).px_fit_results.iloc[0].params[0]
fig.add_annotation(
    x=2, y=0.9,
    text=f"{10*slope:.2f} · 10<sup>-4</sup> nt<sup>-1</sup> τ<sup>-1</sup>",
    showarrow=False,
    font={'family': "Inter", 'size': font_size}
)

fig.update_layout(
    template="simple_white",
    height=220,
    width=330,
    showlegend=False,
    margin=dict(l=0, r=8, t=5, b=0),
    font_family="Inter",
    legend_font_size=font_size,
)
fig.update_xaxes(
    title_text='Storage duration / τ',
    title_font_family="Inter",
    title_font_size=font_size,
    range=[-0.1, 9],
    dtick=2,
    minor_ticks="outside",
    minor_dtick=1,
    tickfont_size=font_size,
)
fig.update_yaxes(
    title_text='Excess deletion rate / 10<sup>-3</sup> nt<sup>-1</sup>',
    title_font_family="Inter",
    title_font_size=font_size,
    range=[-0.02, 1.5],
    dtick=0.5,
    minor_ticks="outside",
    minor_dtick=0.25,
    tickfont_size=font_size,
)

fig.show()
fig.write_image("error_rates_deletions.svg")

In [5]:
# Figure: share of each substitution category in the overall rate at which
# substitutions accumulate along the series variable.
df = data.data["substitutions_by_type"].copy()

# Each substitution type and its counterpart (e.g. A2C and T2G) share one
# category label.
error_categories = {
    'A2C': "A2C/T2G",
    'A2G': "A2G/T2C",
    'A2T': "A2T/T2A",
    'C2A': "C2A/G2T",
    'C2G': "C2G/G2C",
    'C2T': "C2T/G2A",
    'T2G': "A2C/T2G",
    'T2C': "A2G/T2C",
    'T2A': "A2T/T2A",
    'G2T': "C2A/G2T",
    'G2C': "C2G/G2C",
    'G2A': "C2T/G2A"
}
font_size = 28/3

# Find the intercept to normalize initial rates for each experiment group:
# fit rate ~ delta_series_var per (group, type, read) stratum and subtract the
# fitted intercept. Only strata that actually occur are fitted, and rows are
# addressed by position so the result does not depend on the frame's index.
strata = ['group', 'type', 'read']
delta_rate = df['rate'].to_numpy(dtype=float, copy=True)
for positions in df.groupby(strata).indices.values():
    fit = smf.ols(formula='rate ~ delta_series_var', data=df.iloc[positions]).fit()
    delta_rate[positions] -= fit.params['Intercept']
df['delta_rate'] = delta_rate

# Combine analogous types into categories; unmapped types keep their own name.
df['category'] = df['type'].map(lambda error: error_categories.get(error, error))

# Find fits for the categories based on all data. sort=False keeps the
# categories in order of first appearance.
fits = {
    category: smf.ols(formula='delta_rate ~ delta_series_var', data=category_rows).fit()
    for category, category_rows in df.groupby('category', sort=False)
}
ratios = {category: fit.params['delta_series_var'] for category, fit in fits.items()}

# Slope per category, and each slope's share of the summed slopes.
ratio_df = pd.DataFrame.from_dict(ratios, orient="index", columns=["rate"])
ratio_df['ratio'] = ratio_df['rate']/ratio_df['rate'].sum()
ratio_df['ratio/2'] = ratio_df['ratio']/2
display(ratio_df)

# A constant 'group' column puts all categories into one stacked horizontal
# bar, ordered from largest to smallest share.
factor_df = ratio_df.assign(group=1).sort_values("ratio", ascending=False)

cat_color_map = {
    "C2G/G2C": '#bae4b3',
    "A2C/T2G": '#525252',
    "C2A/G2T": '#74c476',
    "A2T/T2A": '#cccccc',
    "C2T/G2A": '#238b45',
    "A2G/T2C": '#969696'
}

fig = px.bar(
    factor_df,
    y='group',
    x='ratio',
    color=factor_df.index,
    range_x=[0, 1],
    color_discrete_map=cat_color_map,
    orientation='h',
)

fig.update_layout(
    template="simple_white",
    height=85,
    width=330,
    showlegend=False,
    coloraxis_showscale=False,
    margin=dict(l=36, r=0, t=30, b=20),
)
fig.update_xaxes(
    title_text='Fraction of substitutions',
    tickformat=",.0%",
    dtick=0.25,
    side='top',
    title_font_family="Inter",
    title_font_size=font_size,
    tickfont_size=font_size,
    minor_ticks="outside",
    minor_dtick=0.125
)
fig.update_yaxes(visible=False)

fig.write_image("base_bias.svg")
fig.show()
factor_df

Unnamed: 0,rate,ratio,ratio/2
A2C/T2G,4e-06,0.050007,0.025004
A2G/T2C,3e-06,0.035107,0.017553
A2T/T2A,1e-06,0.015668,0.007834
C2A/G2T,5e-06,0.071451,0.035726
C2G/G2C,5e-06,0.063691,0.031845
C2T/G2A,5.5e-05,0.764076,0.382038


Unnamed: 0,rate,ratio,ratio/2,group
C2T/G2A,5.5e-05,0.764076,0.382038,1
C2A/G2T,5e-06,0.071451,0.035726,1
C2G/G2C,5e-06,0.063691,0.031845,1
A2C/T2G,4e-06,0.050007,0.025004,1
A2G/T2C,3e-06,0.035107,0.017553,1
A2T/T2A,1e-06,0.015668,0.007834,1
