In [12]:
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as pg

def anova(idata, errortype):
    """Fit an OLS model to one error type's rates and return its type-II ANOVA table.

    Rows of ``idata.data['overall_error_rates']`` whose ``type`` equals
    ``errortype`` are selected, and each ``group`` label is split on ``_`` into
    the columns ``synthesis``, ``GC``, ``PCRC`` and ``Decay``. The last two are
    cast to integers and enter the model as numeric covariates; the first two
    are treated as categorical factors, including their interaction.

    As a side effect, the model summary and a copy of the ANOVA table with
    ``eta_sq`` formatted as a percentage string are displayed.

    Parameters
    ----------
    idata : object
        Object exposing ``data['overall_error_rates']``, a DataFrame with at
        least the columns ``type``, ``group`` and ``rate``.
    errortype : str
        Value of the ``type`` column to analyse, e.g. ``'deletions'``.

    Returns
    -------
    pandas.DataFrame
        The table returned by ``anova_lm`` with an added numeric ``eta_sq``
        column (each row's ``sum_sq`` divided by the total of ``sum_sq``).

    Raises
    ------
    ValueError
        If no row of the error-rate table has the requested ``errortype``.
    """
    rates = idata.data['overall_error_rates']
    errordata = rates.loc[rates['type'] == errortype].copy()
    if errordata.empty:
        # Fail early with a readable message instead of an opaque error from
        # the column assignment or the model fit further down.
        raise ValueError(f"no error rates found for error type {errortype!r}")

    # Group labels follow the pattern '<synthesis>_<GC>_<PCRC>_<Decay>'.
    errordata[['synthesis', 'GC', 'PCRC', 'Decay']] = errordata['group'].str.split('_', expand=True)
    errordata = errordata.astype({'PCRC': 'int', 'Decay': 'int'})

    model = smf.ols('rate ~ C(synthesis) + C(GC) + C(synthesis):C(GC) + PCRC + Decay', data=errordata).fit()
    display(model.summary())

    # The ANOVA table requests HC3 robust covariance; the summary displayed
    # above uses whatever covariance type the fit itself was made with.
    aov = sm.stats.anova_lm(model, typ=2, robust='hc3')
    aov['eta_sq'] = aov['sum_sq'] / aov['sum_sq'].sum()

    # Display a percentage-formatted copy; the returned table stays numeric.
    aov_display = aov.assign(eta_sq=aov['eta_sq'].map(lambda share: f"{share*100:0.1f}%"))
    display(aov_display)
    return aov

In [3]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

# Build the grouped analysis from (group label, result directory) pairs.
# Labels follow the pattern '<synthesis>_<GC>_<PCRC>_<Decay>'; downstream cells
# split them on '_' into those four columns.
# NOTE(review): the label 'Twist_GCfix_45_0' is used for both the PCR/45c and
# the Aging/0b directories -- confirm GroupAnalysis treats repeated labels as intended.
data = analysis.GroupAnalysis([
    (label, analysis.ErrorAnalysis(folder))
    for label, folder in (
        # Genscript, GCfix
        ('Genscript_GCfix_36_0', "../../data/Aging/0a_Genscript_GCfix"),
        ('Genscript_GCfix_42_0', "../../data/Aging/0b_Genscript_GCfix"),
        ('Genscript_GCfix_42_2', "../../data/Aging/2d_Genscript_GCfix"),
        ('Genscript_GCfix_42_4', "../../data/Aging/4d_Genscript_GCfix"),
        ('Genscript_GCfix_42_7', "../../data/Aging/7d_Genscript_GCfix"),
        ('Genscript_GCfix_10_0', "../../data/PCR/10c_Genscript_GCfix"),
        ('Genscript_GCfix_15_0', "../../data/PCR/15c_Genscript_GCfix"),
        ('Genscript_GCfix_20_0', "../../data/PCR/20c_Genscript_GCfix"),
        ('Genscript_GCfix_25_0', "../../data/PCR/25c_Genscript_GCfix"),
        # Genscript, GCall
        ('Genscript_GCall_36_0', "../../data/Aging/0a_Genscript_GCall"),
        ('Genscript_GCall_42_0', "../../data/Aging/0b_Genscript_GCall"),
        ('Genscript_GCall_42_2', "../../data/Aging/2d_Genscript_GCall"),
        ('Genscript_GCall_42_4', "../../data/Aging/4d_Genscript_GCall"),
        ('Genscript_GCall_42_7', "../../data/Aging/7d_Genscript_GCall"),
        ('Genscript_GCall_15_0', "../../data/PCR/15c_Genscript_GCall"),
        ('Genscript_GCall_20_0', "../../data/PCR/20c_Genscript_GCall"),
        ('Genscript_GCall_25_0', "../../data/PCR/25c_Genscript_GCall"),
        ('Genscript_GCall_30_0', "../../data/PCR/30c_Genscript_GCall"),
        # Twist, GCall
        ('Twist_GCall_15_0', "../../data/PCR/15c_Twist_GCall"),
        ('Twist_GCall_30_0', "../../data/PCR/30c_Twist_GCall"),
        ('Twist_GCall_45_0', "../../data/PCR/45c_Twist_GCall"),
        ('Twist_GCall_60_0', "../../data/PCR/60c_Twist_GCall"),
        ('Twist_GCall_75_0', "../../data/PCR/75c_Twist_GCall"),
        ('Twist_GCall_90_0', "../../data/PCR/90c_Twist_GCall"),
        ('Twist_GCall_37_0', "../../data/Aging/0a_Twist_GCall"),
        ('Twist_GCall_43_0', "../../data/Aging/0b_Twist_GCall"),
        ('Twist_GCall_43_2', "../../data/Aging/2d_Twist_GCall"),
        ('Twist_GCall_43_4', "../../data/Aging/4d_Twist_GCall"),
        ('Twist_GCall_43_7', "../../data/Aging/7d_Twist_GCall"),
        # Twist, GCfix
        ('Twist_GCfix_15_0', "../../data/PCR/15c_Twist_GCfix"),
        ('Twist_GCfix_30_0', "../../data/PCR/30c_Twist_GCfix"),
        ('Twist_GCfix_45_0', "../../data/PCR/45c_Twist_GCfix"),
        ('Twist_GCfix_60_0', "../../data/PCR/60c_Twist_GCfix"),
        ('Twist_GCfix_75_0', "../../data/PCR/75c_Twist_GCfix"),
        ('Twist_GCfix_90_0', "../../data/PCR/90c_Twist_GCfix"),
        ('Twist_GCfix_39_0', "../../data/Aging/0a_Twist_GCfix"),
        ('Twist_GCfix_45_0', "../../data/Aging/0b_Twist_GCfix"),
        ('Twist_GCfix_45_2', "../../data/Aging/2d_Twist_GCfix"),
        ('Twist_GCfix_45_4', "../../data/Aging/4d_Twist_GCfix"),
        ('Twist_GCfix_45_7', "../../data/Aging/7d_Twist_GCfix"),
    )
])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
def summary(group):
    """Return the mean, standard deviation, minimum and maximum of ``group.rate``.

    The result is a ``pd.Series`` indexed by ``'mean'``, ``'std'``, ``'min'``
    and ``'max'``, in that order.
    """
    rates = group.rate
    stat_names = ['mean', 'std', 'min', 'max']
    # Each statistic is the like-named reduction method on the rate column.
    stats = {stat: getattr(rates, stat)() for stat in stat_names}
    return pd.Series(stats, index=stat_names)

# Work on a copy of the error-rate table (all error types) so the table held
# by `data` is not modified.
errordata = data.data['overall_error_rates'].copy()
# Split each group label on '_' into its four parts. No dtype cast is applied
# here, so PCRC and Decay keep whatever dtype the string split produces.
errordata[['synthesis', 'GC', 'PCRC', 'Decay']] = errordata.group.str.split('_', expand=True)

In [13]:
# Run the ANOVA on the 'deletions' error rates; anova() displays its tables
# and returns the numeric ANOVA table, kept here as del_table.
del_table = anova(data, 'deletions')

0,1,2,3
Dep. Variable:,rate,R-squared:,0.994
Model:,OLS,Adj. R-squared:,0.994
Method:,Least Squares,F-statistic:,2453.0
Date:,"Thu, 23 Mar 2023",Prob (F-statistic):,1.0700000000000001e-80
Time:,11:12:22,Log-Likelihood:,489.23
No. Observations:,80,AIC:,-966.5
Df Residuals:,74,BIC:,-952.2
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0121,0.000,66.366,0.000,0.012,0.012
C(synthesis)[T.Twist],-0.0112,0.000,-60.403,0.000,-0.012,-0.011
C(GC)[T.GCfix],0.0044,0.000,23.561,0.000,0.004,0.005
C(synthesis)[T.Twist]:C(GC)[T.GCfix],-0.0044,0.000,-17.402,0.000,-0.005,-0.004
PCRC,-6.036e-06,3.82e-06,-1.580,0.118,-1.36e-05,1.58e-06
Decay,-4.944e-06,2.74e-05,-0.180,0.857,-5.96e-05,4.97e-05

0,1,2,3
Omnibus:,7.968,Durbin-Watson:,0.567
Prob(Omnibus):,0.019,Jarque-Bera (JB):,9.72
Skew:,-0.453,Prob(JB):,0.00775
Kurtosis:,4.448,Cond. No.,241.0


Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq
C(synthesis),0.003218085,1.0,10421.217916,2.250311e-81,96.9%
C(GC),6.273956e-06,1.0,20.317132,2.410631e-05,0.2%
C(synthesis):C(GC),7.313177e-05,1.0,236.824723,9.154429e-25,2.2%
PCRC,9.052362e-07,1.0,2.931452,0.09105727,0.0%
Decay,1.982498e-08,1.0,0.0642,0.8006797,0.0%
Residual,2.285129e-05,74.0,,,0.7%


In [9]:
# Run the ANOVA on the 'substitutions' error rates; anova() displays its tables
# and returns the numeric ANOVA table, kept here as sub_table.
sub_table = anova(data, 'substitutions')

0,1,2,3
Dep. Variable:,rate,R-squared:,0.936
Model:,OLS,Adj. R-squared:,0.932
Method:,Least Squares,F-statistic:,217.5
Date:,"Thu, 23 Mar 2023",Prob (F-statistic):,9.18e-43
Time:,11:04:41,Log-Likelihood:,496.09
No. Observations:,80,AIC:,-980.2
Df Residuals:,74,BIC:,-965.9
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0051,0.000,30.515,0.000,0.005,0.005
C(synthesis)[T.Twist],-0.0026,0.000,-15.059,0.000,-0.003,-0.002
C(GC)[T.GCfix],-0.0013,0.000,-7.496,0.000,-0.002,-0.001
C(synthesis)[T.Twist]:C(GC)[T.GCfix],0.0012,0.000,5.282,0.000,0.001,0.002
PCRC,0.0001,3.51e-06,30.670,0.000,0.000,0.000
Decay,0.0001,2.52e-05,4.551,0.000,6.44e-05,0.000

0,1,2,3
Omnibus:,3.534,Durbin-Watson:,1.855
Prob(Omnibus):,0.171,Jarque-Bera (JB):,2.753
Skew:,0.377,Prob(JB):,0.252
Kurtosis:,3.506,Cond. No.,241.0


Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq
C(synthesis),6.1e-05,1.0,233.158045,1.4226579999999999e-24,11.2%
C(GC),2e-06,1.0,8.183014,0.005492861,0.4%
C(synthesis):C(GC),6e-06,1.0,23.283556,7.30905e-06,1.1%
PCRC,0.000448,1.0,1724.177762,5.0992230000000005e-53,82.5%
Decay,7e-06,1.0,25.977159,2.56299e-06,1.2%
Residual,1.9e-05,74.0,,,3.5%


In [31]:
# Box plot of `rate` per `GC` value, coloured by the `synthesis` column, with
# one facet column per error `type`. Layout and x-axis styling are chained onto
# the figure; all font sizes are 28/3.
fig = (
    px.box(
        errordata,
        x="GC",
        y="rate",
        color="synthesis",
        facet_col="type",
        color_discrete_sequence=["#3182bd", "#de2d26"],
    )
    .update_layout(
        template='simple_white',
        height=250,
        width=500,
        showlegend=False,
        margin={"l": 0, "r": 10, "t": 20, "b": 0},
        font_family="Inter",
        legend_font_size=28/3,
    )
    .update_xaxes(title_font_size=28/3, tickfont_size=28/3)
)

# Kept as a bare expression (not folded into the assignment above) so that its
# return value remains the statement's result, exactly as in the original cell.
fig.update_yaxes(
    range=[0, 0.0175],
    title_font_size=28/3,
    tickfont_size=28/3,
    minor_ticks="outside",
    minor_dtick=0.0025,
)