In [2]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as pg

# Reference substitution rate: the arithmetic mean of two per-nucleotide rates
# (0.0011 and 0.0025).
# NOTE(review): presumably these are the sequencing substitution error rates of
# the two reads -- confirm the source of both constants.
seq_sub_rate = 0.5 * (0.0011 + 0.0025)

In [3]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

data = analysis.SeriesAnalysis([
    ('Genscript_GCall', 15, analysis.ErrorAnalysis("../../data/PCR/15c_Genscript_GCall")),
    ('Genscript_GCall', 20, analysis.ErrorAnalysis("../../data/PCR/20c_Genscript_GCall")),
    ('Genscript_GCall', 25, analysis.ErrorAnalysis("../../data/PCR/25c_Genscript_GCall")),
    ('Genscript_GCall', 30, analysis.ErrorAnalysis("../../data/PCR/30c_Genscript_GCall")),
    ('Genscript_GCfix', 10, analysis.ErrorAnalysis("../../data/PCR/10c_Genscript_GCfix")),
    ('Genscript_GCfix', 15, analysis.ErrorAnalysis("../../data/PCR/15c_Genscript_GCfix")),
    ('Genscript_GCfix', 20, analysis.ErrorAnalysis("../../data/PCR/20c_Genscript_GCfix")),
    ('Genscript_GCfix', 25, analysis.ErrorAnalysis("../../data/PCR/25c_Genscript_GCfix")),
    ('Twist_GCall', 15, analysis.ErrorAnalysis("../../data/PCR/15c_Twist_GCall")),
    ('Twist_GCall', 30, analysis.ErrorAnalysis("../../data/PCR/30c_Twist_GCall")),
    ('Twist_GCall', 45, analysis.ErrorAnalysis("../../data/PCR/45c_Twist_GCall")),
    ('Twist_GCall', 60, analysis.ErrorAnalysis("../../data/PCR/60c_Twist_GCall")),
    ('Twist_GCall', 75, analysis.ErrorAnalysis("../../data/PCR/75c_Twist_GCall")),
    ('Twist_GCall', 90, analysis.ErrorAnalysis("../../data/PCR/90c_Twist_GCall")),
    ('Twist_GCfix', 15, analysis.ErrorAnalysis("../../data/PCR/15c_Twist_GCfix")),
    ('Twist_GCfix', 30, analysis.ErrorAnalysis("../../data/PCR/30c_Twist_GCfix")),
    ('Twist_GCfix', 45, analysis.ErrorAnalysis("../../data/PCR/45c_Twist_GCfix")),
    ('Twist_GCfix', 60, analysis.ErrorAnalysis("../../data/PCR/60c_Twist_GCfix")),
    ('Twist_GCfix', 75, analysis.ErrorAnalysis("../../data/PCR/75c_Twist_GCfix")),
    ('Twist_GCfix', 90, analysis.ErrorAnalysis("../../data/PCR/90c_Twist_GCfix")),
])

In [4]:
# Pull the overall error-rate table once and keep only the substitution rows.
# The .copy() gives an independent frame so the new columns below do not write
# into the table held by `data`.
error_rates = data.data["overall_error_rates"]
is_substitution = error_rates["type"] == "substitutions"
alldf = error_rates.loc[is_substitution].copy()

# Group labels have the form "<synthesis>_<GC>"; split them into two columns.
group_parts = alldf["group"].str.split("_", expand=True)
alldf[["synthesis", "GC"]] = group_parts

In [5]:
# Substitution rate vs. PCR cycles: scatter of the measured rates, one OLS
# regression line per synthesis group, and a dashed reference line at
# `seq_sub_rate`. The figure is shown inline and written to an SVG file.

colors_lines = ["#08519c", "#a50f15"]
colors_points = ["#6baed6", "#fb6a4a"]

# The y-axis title states units of 1e-3 nt^-1, so every quantity drawn on the
# figure is multiplied by this factor. The regressions themselves are still
# fitted on the raw rates, so the printed coefficients stay in nt^-1.
rate_scale = 1e3

# Work on a derived frame so `alldf` is not mutated by this cell.
plot_df = alldf.assign(rate_scaled=alldf["rate"] * rate_scale)

fig = px.scatter(
    plot_df,
    x="series_var",
    y="rate_scaled",
    color="synthesis",
    symbol="read",
    color_discrete_sequence=colors_points
)

# x positions at which the fitted lines are evaluated
x_var = np.linspace(0, 90, 10)

for i, synthesis in enumerate(alldf.synthesis.unique()):
    window = alldf.loc[alldf.synthesis == synthesis]
    fit = smf.ols(formula='rate ~ series_var', data=window).fit()

    # Label the tables so it is clear which synthesis group they belong to.
    print(f"OLS fit of rate ~ series_var for synthesis = {synthesis}")
    display(fit.summary())
    display(fit.summary2().tables[1])

    fig.add_trace(
        pg.Scatter(
            x=x_var,
            y=fit.predict(exog={'series_var': x_var}) * rate_scale,
            mode="lines",
            showlegend=False,
            # Cycle through the palette the same way plotly express cycles
            # `color_discrete_sequence`, so a third group cannot raise
            # IndexError and line colours keep matching the point colours.
            line=dict(color=colors_lines[i % len(colors_lines)], width=2.5)
        )
    )

# Dashed reference line at the baseline substitution rate (same units as the axis).
fig.add_hline(y=seq_sub_rate * rate_scale, line_width=2, line_dash="dash")

fig.update_layout(
    template='simple_white',
    height=330,
    width=330,
    showlegend=False,
    margin=dict(l=0, r=10, t=20, b=0),
    font_family="Inter",
    legend_font_size=28/3,
)
fig.update_xaxes(
    title="PCR cycles",
    range=[0, 31],
    title_font_size=28/3,
    tickfont_size=28/3
)
fig.update_yaxes(
    title="Substitution rate / 10<sup>-3</sup> nt<sup>-1</sup>",
    # Same visible window as before (0 to 0.012 nt^-1), expressed in 1e-3 nt^-1.
    range=[0, 12],
    title_font_size=28/3,
    tickfont_size=28/3,
    minor_ticks="outside",
    minor_dtick=1
)

fig.show()
fig.write_image("init_substitutions.svg")


kurtosistest only valid for n>=20 ... continuing anyway, n=16



0,1,2,3
Dep. Variable:,rate,R-squared:,0.884
Model:,OLS,Adj. R-squared:,0.876
Method:,Least Squares,F-statistic:,106.7
Date:,"Wed, 21 Jun 2023",Prob (F-statistic):,6.23e-08
Time:,12:36:18,Log-Likelihood:,100.17
No. Observations:,16,AIC:,-196.3
Df Residuals:,14,BIC:,-194.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0027,0.000,6.462,0.000,0.002,0.004
series_var,0.0002,2.02e-05,10.331,0.000,0.000,0.000

0,1,2,3
Omnibus:,1.504,Durbin-Watson:,3.427
Prob(Omnibus):,0.471,Jarque-Bera (JB):,1.099
Skew:,0.406,Prob(JB):,0.577
Kurtosis:,2.006,Cond. No.,71.6



kurtosistest only valid for n>=20 ... continuing anyway, n=16



Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.002726,0.000422,6.462401,1.491703e-05,0.001821,0.003631
series_var,0.000208,2e-05,10.330715,6.233656e-08,0.000165,0.000252


0,1,2,3
Dep. Variable:,rate,R-squared:,0.991
Model:,OLS,Adj. R-squared:,0.991
Method:,Least Squares,F-statistic:,2560.0
Date:,"Wed, 21 Jun 2023",Prob (F-statistic):,2.9e-24
Time:,12:36:18,Log-Likelihood:,164.08
No. Observations:,24,AIC:,-324.2
Df Residuals:,22,BIC:,-321.8
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0025,0.000,19.599,0.000,0.002,0.003
series_var,0.0001,2.16e-06,50.594,0.000,0.000,0.000

0,1,2,3
Omnibus:,1.352,Durbin-Watson:,2.853
Prob(Omnibus):,0.509,Jarque-Bera (JB):,0.906
Skew:,-0.062,Prob(JB):,0.636
Kurtosis:,2.056,Cond. No.,133.0


Unnamed: 0,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
Intercept,0.002475,0.000126,19.598768,2.034264e-15,0.002213,0.002737
series_var,0.000109,2e-06,50.594325,2.904193e-24,0.000105,0.000114
