In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as pg

def anova(idata, errortype):
    """Fit an OLS model to one error type and report the variance share of each factor.

    The rows of ``idata.data['overall_error_rates']`` whose ``type`` equals
    ``errortype`` are selected, their ``group`` label is split on ``_`` into
    the columns ``synthesis``, ``GC``, ``PCRC`` and ``Decay``, and the model
    ``rate ~ C(synthesis) + PCRC + Decay`` is fitted. The selected rows, the
    model summary and a percent-formatted ANOVA table are displayed as a side
    effect.

    Parameters
    ----------
    idata : object
        Anything exposing ``data['overall_error_rates']`` as a DataFrame with
        at least the columns ``type``, ``group`` and ``rate``.
    errortype : str
        Value of the ``type`` column to analyse, e.g. ``'deletions'``.

    Returns
    -------
    pandas.DataFrame
        The type-II table returned by ``anova_lm`` with an added numeric
        ``eta_sq`` column: each row's ``sum_sq`` divided by the total
        ``sum_sq`` (the residual row is part of that total).

    Raises
    ------
    ValueError
        If no row matches ``errortype`` or the group labels do not split into
        exactly four ``_``-separated parts.
    """
    all_rates = idata.data['overall_error_rates']
    errordata = all_rates.loc[all_rates['type'] == errortype].copy()
    if errordata.empty:
        # Without this check the column assignment below fails with an
        # unrelated-looking "Columns must be same length as key" error.
        raise ValueError(f"no rows with type == {errortype!r} in 'overall_error_rates'")

    factors = errordata['group'].str.split('_', expand=True)
    if factors.shape[1] != 4:
        raise ValueError(
            "group labels must look like '<synthesis>_<GC>_<PCRC>_<Decay>', "
            f"got {factors.shape[1]} part(s)"
        )
    errordata[['synthesis', 'GC', 'PCRC', 'Decay']] = factors
    # The split yields strings; PCRC and Decay enter the model as numeric covariates.
    errordata = errordata.astype({'PCRC': 'int', 'Decay': 'int'})
    display(errordata)

    model = smf.ols('rate ~ C(synthesis) + PCRC + Decay', data=errordata).fit()
    display(model.summary())

    # NOTE(review): the summary above comes from a fit without a cov_type,
    # while the ANOVA below requests HC3 — the test statistics of the two
    # displayed tables therefore need not agree. Confirm this is intended.
    aov = sm.stats.anova_lm(model, typ=2, robust='hc3')
    aov['eta_sq'] = aov['sum_sq'] / aov['sum_sq'].sum()

    # Only the displayed copy is formatted as text; the returned table keeps
    # eta_sq numeric so callers can plot it.
    display(aov.assign(eta_sq=[f"{share*100:0.1f}%" for share in aov['eta_sq']]))
    return aov

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

# Folder that holds one sub-folder per "<pool>_<series>" experiment set.
_VALIDATION_ROOT = "../../../data/internal_validation"

# One entry per experiment series: (pool, series, [(PCRC, Decay, run folder), ...]).
# Each run becomes the group label "<pool>_<PCRC>_<Decay>", which the analysis
# cells split on "_" into synthesis / GC / PCRC / Decay.
# NOTE(review): PCRC is presumably the PCR cycle count and Decay the aging
# time in days (run folders "2d", "4d", "7d") — confirm.
# NOTE(review): the label "Twist_GCfix_45_0" is produced twice (PCR/PCR2 and
# Aging/0b); both runs are kept as separate list entries.
_SERIES = [
    ("Genscript_GCfix", "Aging", [(36, 0, "0a"), (42, 0, "0b"), (42, 2, "2d"), (42, 4, "4d"), (42, 7, "7d")]),
    ("Genscript_GCfix", "PCR", [(10, 0, "PCR0"), (15, 0, "PCR1"), (20, 0, "PCR2"), (25, 0, "PCR3")]),
    ("Genscript_GCall", "Aging", [(36, 0, "0a"), (42, 0, "0b"), (42, 2, "2d"), (42, 4, "4d"), (42, 7, "7d")]),
    ("Genscript_GCall", "PCR", [(15, 0, "PCR0"), (20, 0, "PCR1"), (25, 0, "PCR2"), (30, 0, "PCR3")]),
    ("Twist_GCall", "PCR", [(15, 0, "PCR0"), (30, 0, "PCR1"), (45, 0, "PCR2"), (60, 0, "PCR3"), (75, 0, "PCR4"), (90, 0, "PCR5")]),
    ("Twist_GCall", "Aging", [(37, 0, "0a"), (43, 0, "0b"), (43, 2, "2d"), (43, 4, "4d"), (43, 7, "7d")]),
    ("Twist_GCfix", "PCR", [(15, 0, "PCR0"), (30, 0, "PCR1"), (45, 0, "PCR2"), (60, 0, "PCR3"), (75, 0, "PCR4"), (90, 0, "PCR5")]),
    ("Twist_GCfix", "Aging", [(39, 0, "0a"), (45, 0, "0b"), (45, 2, "2d"), (45, 4, "4d"), (45, 7, "7d")]),
]

# Build the (label, ErrorAnalysis) pairs in the order listed above.
data = analysis.GroupAnalysis([
    (
        f"{pool}_{pcrc}_{decay}",
        analysis.ErrorAnalysis(f"{_VALIDATION_ROOT}/{pool}_{series}/{folder}"),
    )
    for pool, series, runs in _SERIES
    for pcrc, decay, folder in runs
])

In [3]:
def summary(group):
    """Return mean, std, min and max of ``group.rate`` as a labelled Series.

    Intended as the callable for ``groupby(...).apply``: ``group`` is any
    object with a ``rate`` Series attribute, and the result is indexed by
    ``'mean'``, ``'std'``, ``'min'``, ``'max'`` in that order.
    """
    rates = group.rate
    labels = ['mean', 'std', 'min', 'max']
    values = [rates.mean(), rates.std(), rates.min(), rates.max()]
    return pd.Series(dict(zip(labels, values)), index=labels)

# Work on a copy and split the group label ('<synthesis>_<GC>_<PCRC>_<Decay>')
# into one column per factor so the rates can be grouped by synthesis.
errordata = data.data['overall_error_rates'].copy()
errordata[['synthesis', 'GC', 'PCRC', 'Decay']] = errordata['group'].str.split('_', expand=True)

# `summary` only reads the 'rate' column, so select it before `apply`:
# letting apply operate on the grouping columns is deprecated in recent
# pandas releases, and the resulting tables are the same.
display(errordata.groupby(['type'])[['rate']].apply(summary))
display(errordata.groupby(['synthesis', 'type'])[['rate']].apply(summary))

Unnamed: 0_level_0,mean,std,min,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
deletions,0.006395,0.006602,0.000563,0.015513
delevents,0.0051,0.00529,0.000386,0.012121
insertions,1.8e-05,1.1e-05,2e-06,4.3e-05
substitutions,0.007258,0.002381,0.00251,0.011767


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,min,max
synthesis,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Genscript,deletions,0.013497,0.002011,0.01143,0.015513
Genscript,delevents,0.010838,0.001263,0.009498,0.012121
Genscript,insertions,2.1e-05,1.2e-05,2e-06,4.3e-05
Genscript,substitutions,0.007832,0.002565,0.002854,0.011767
Twist,deletions,0.000584,1.5e-05,0.000563,0.00062
Twist,delevents,0.000405,1.3e-05,0.000386,0.000439
Twist,insertions,1.5e-05,9e-06,2e-06,3.9e-05
Twist,substitutions,0.006787,0.002134,0.00251,0.011501


In [4]:
# ANOVA on the deletion rates; the returned table (with numeric eta_sq) is
# kept because the figure cell stacks it next to the substitution table.
del_table = anova(data, 'deletions')

Unnamed: 0,type,rate,read,exp,group,synthesis,GC,PCRC,Decay
1,deletions,0.015469,1,0a,Genscript_GCfix_36_0,Genscript,GCfix,36,0
5,deletions,0.015469,2,0a,Genscript_GCfix_36_0,Genscript,GCfix,36,0
9,deletions,0.015492,1,0b,Genscript_GCfix_42_0,Genscript,GCfix,42,0
13,deletions,0.015494,2,0b,Genscript_GCfix_42_0,Genscript,GCfix,42,0
17,deletions,0.015494,1,2d,Genscript_GCfix_42_2,Genscript,GCfix,42,2
...,...,...,...,...,...,...,...,...,...
301,deletions,0.000579,2,2d,Twist_GCfix_45_2,Twist,GCfix,45,2
305,deletions,0.000590,1,4d,Twist_GCfix_45_4,Twist,GCfix,45,4
309,deletions,0.000604,2,4d,Twist_GCfix_45_4,Twist,GCfix,45,4
313,deletions,0.000603,1,7d,Twist_GCfix_45_7,Twist,GCfix,45,7


0,1,2,3
Dep. Variable:,rate,R-squared:,0.959
Model:,OLS,Adj. R-squared:,0.957
Method:,Least Squares,F-statistic:,592.7
Date:,"Thu, 22 Jun 2023",Prob (F-statistic):,1.3199999999999998e-52
Time:,12:36:13,Log-Likelihood:,416.4
No. Observations:,80,AIC:,-824.8
Df Residuals:,76,BIC:,-815.3
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0136,0.000,36.169,0.000,0.013,0.014
C(synthesis)[T.Twist],-0.0128,0.000,-37.296,0.000,-0.014,-0.012
PCRC,-4.009e-06,9.36e-06,-0.428,0.670,-2.26e-05,1.46e-05
Decay,3.355e-07,6.73e-05,0.005,0.996,-0.000,0.000

0,1,2,3
Omnibus:,4.031,Durbin-Watson:,0.138
Prob(Omnibus):,0.133,Jarque-Bera (JB):,2.051
Skew:,0.0,Prob(JB):,0.359
Kurtosis:,2.216,Cond. No.,112.0


Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq
C(synthesis),0.002348233,1.0,1264.461596,4.052241e-49,94.3%
PCRC,8.012629e-07,1.0,0.431459,0.5132581,0.0%
Decay,4.106751e-11,1.0,2.2e-05,0.9962603,0.0%
Residual,0.0001411397,76.0,,,5.7%


In [5]:
# ANOVA on the substitution rates; the returned table (with numeric eta_sq)
# is kept because the figure cell stacks it next to the deletion table.
sub_table = anova(data, 'substitutions')

Unnamed: 0,type,rate,read,exp,group,synthesis,GC,PCRC,Decay
0,substitutions,0.007747,1,0a,Genscript_GCfix_36_0,Genscript,GCfix,36,0
4,substitutions,0.008949,2,0a,Genscript_GCfix_36_0,Genscript,GCfix,36,0
8,substitutions,0.008759,1,0b,Genscript_GCfix_42_0,Genscript,GCfix,42,0
12,substitutions,0.009955,2,0b,Genscript_GCfix_42_0,Genscript,GCfix,42,0
16,substitutions,0.009330,1,2d,Genscript_GCfix_42_2,Genscript,GCfix,42,2
...,...,...,...,...,...,...,...,...,...
300,substitutions,0.007489,2,2d,Twist_GCfix_45_2,Twist,GCfix,45,2
304,substitutions,0.006227,1,4d,Twist_GCfix_45_4,Twist,GCfix,45,4
308,substitutions,0.007441,2,4d,Twist_GCfix_45_4,Twist,GCfix,45,4
312,substitutions,0.006991,1,7d,Twist_GCfix_45_7,Twist,GCfix,45,7


0,1,2,3
Dep. Variable:,rate,R-squared:,0.874
Model:,OLS,Adj. R-squared:,0.869
Method:,Least Squares,F-statistic:,175.9
Date:,"Thu, 22 Jun 2023",Prob (F-statistic):,4.14e-34
Time:,12:36:13,Log-Likelihood:,453.11
No. Observations:,80,AIC:,-898.2
Df Residuals:,76,BIC:,-888.7
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0036,0.000,15.153,0.000,0.003,0.004
C(synthesis)[T.Twist],-0.0030,0.000,-13.652,0.000,-0.003,-0.003
PCRC,0.0001,5.91e-06,20.464,0.000,0.000,0.000
Decay,0.0003,4.25e-05,6.615,0.000,0.000,0.000

0,1,2,3
Omnibus:,2.296,Durbin-Watson:,2.286
Prob(Omnibus):,0.317,Jarque-Bera (JB):,1.59
Skew:,-0.114,Prob(JB):,0.451
Kurtosis:,2.348,Cond. No.,112.0


Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq
C(synthesis),0.000129,1.0,174.359601,2.300086e-21,28.3%
PCRC,0.00024,1.0,324.141374,3.902592e-29,52.6%
Decay,3.1e-05,1.0,41.906313,8.465637e-09,6.8%
Residual,5.6e-05,76.0,,,12.3%


In [6]:
# ANOVA on the insertion rates.
# NOTE(review): ins_table is not referenced by the figure cell visible in this
# notebook (only del_table and sub_table are plotted) — confirm that is intended.
ins_table = anova(data, 'insertions')

Unnamed: 0,type,rate,read,exp,group,synthesis,GC,PCRC,Decay
2,insertions,0.000017,1,0a,Genscript_GCfix_36_0,Genscript,GCfix,36,0
6,insertions,0.000024,2,0a,Genscript_GCfix_36_0,Genscript,GCfix,36,0
10,insertions,0.000024,1,0b,Genscript_GCfix_42_0,Genscript,GCfix,42,0
14,insertions,0.000031,2,0b,Genscript_GCfix_42_0,Genscript,GCfix,42,0
18,insertions,0.000026,1,2d,Genscript_GCfix_42_2,Genscript,GCfix,42,2
...,...,...,...,...,...,...,...,...,...
302,insertions,0.000018,2,2d,Twist_GCfix_45_2,Twist,GCfix,45,2
306,insertions,0.000012,1,4d,Twist_GCfix_45_4,Twist,GCfix,45,4
310,insertions,0.000017,2,4d,Twist_GCfix_45_4,Twist,GCfix,45,4
314,insertions,0.000016,1,7d,Twist_GCfix_45_7,Twist,GCfix,45,7


0,1,2,3
Dep. Variable:,rate,R-squared:,0.833
Model:,OLS,Adj. R-squared:,0.826
Method:,Least Squares,F-statistic:,126.1
Date:,"Thu, 22 Jun 2023",Prob (F-statistic):,1.9800000000000002e-29
Time:,12:36:13,Log-Likelihood:,873.46
No. Observations:,80,AIC:,-1739.0
Df Residuals:,76,BIC:,-1729.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.322e-06,1.24e-06,1.867,0.066,-1.55e-07,4.8e-06
C(synthesis)[T.Twist],-1.36e-05,1.14e-06,-11.954,0.000,-1.59e-05,-1.13e-05
PCRC,5.108e-07,3.09e-08,16.532,0.000,4.49e-07,5.72e-07
Decay,1.506e-06,2.22e-07,6.779,0.000,1.06e-06,1.95e-06

0,1,2,3
Omnibus:,5.391,Durbin-Watson:,2.341
Prob(Omnibus):,0.068,Jarque-Bera (JB):,3.387
Skew:,0.323,Prob(JB):,0.184
Kurtosis:,2.226,Cond. No.,112.0


Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq
C(synthesis),2.221736e-09,1.0,109.730704,2.108345e-16,26.8%
PCRC,3.925615e-09,1.0,193.884587,1.305915e-22,47.3%
Decay,6.157382e-10,1.0,30.41107,4.61883e-07,7.4%
Residual,1.538785e-09,76.0,,,18.5%


In [7]:
# Stack the deletion and substitution ANOVA tables for plotting. `assign`
# returns labelled copies, so the tables produced by the earlier cells stay
# purely numeric and this cell can be re-run without side effects on them.
df = pd.concat([
    del_table.assign(type='Dels.'),
    sub_table.assign(type='Subs.'),
])

# Single font size shared by the legend and both axes.
font_size = 28 / 3

# One stacked bar per error type; the segments are the ANOVA rows (model
# terms plus residual), coloured by the table index.
fig = px.bar(
    df,
    x="type",
    y="eta_sq",
    color=df.index,
    color_discrete_sequence=["#252525", "#636363", "#969696", "#cccccc"],
)

fig.update_layout(
    template='simple_white',
    height=375,
    width=125,
    # Large bottom margin: the legend is anchored below the plot area (y < 0).
    margin=dict(l=0, r=0, t=0, b=150),
    font_family="Inter",
    legend=dict(
        yanchor="top",
        y=-0.15,
        xanchor="left",
        x=-0.4,
        title_text='',
        font=dict(size=font_size),
    ),
)
fig.update_yaxes(
    title_text='Explained variance',
    tickformat=",.0%",
    range=[0, 1],
    title_font_size=font_size,
    tickfont_size=font_size,
)
fig.update_xaxes(title_text='', title_font_size=font_size, tickfont_size=font_size)
fig.show()
fig.write_image("anova.svg")