In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as pg

def anova(idata, errortype):
    """Fit a linear model for one error type and report its ANOVA table.

    The rows of ``idata`` whose ``type`` column equals ``errortype`` are
    fitted with ordinary least squares using the formula
    ``val ~ C(synthesis) + PCRC + Decay``. The model summary and a formatted
    ANOVA table are shown via ``display`` (expected to be provided by the
    IPython/Jupyter session).

    Parameters
    ----------
    idata : pandas.DataFrame
        Must contain the columns ``type``, ``val``, ``synthesis``, ``PCRC``
        and ``Decay``.
    errortype : str
        Value of the ``type`` column to select.

    Returns
    -------
    pandas.DataFrame
        Type-II ANOVA table (HC3 robust covariance) with an additional
        numeric ``eta_sq`` column: each row's ``sum_sq`` divided by the
        total ``sum_sq`` over all rows of the table, residual included.

    Raises
    ------
    ValueError
        If no row of ``idata`` has ``type == errortype``.
    """
    subset = idata.loc[idata['type'] == errortype]
    if subset.empty:
        # Fail early with a readable message instead of an obscure error
        # raised from inside the formula/model machinery.
        raise ValueError(
            f"no rows with type == {errortype!r}; "
            f"available types: {idata['type'].unique().tolist()}"
        )

    model = smf.ols('val ~ C(synthesis) + PCRC + Decay', data=subset).fit()
    display(model.summary())

    aov = sm.stats.anova_lm(model, typ=2, robust='hc3')
    aov['eta_sq'] = aov['sum_sq'] / aov['sum_sq'].sum()

    # Show percentages for readability; the returned table stays numeric.
    aov_display = aov.assign(
        eta_sq=aov['eta_sq'].map(lambda share: f"{share*100:0.1f}%")
    )
    display(aov_display)
    return aov

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

# (label, directory) for every experiment. Labels follow the pattern
# "<synthesis>_<GC>_<PCRC>_<Decay>", which is split on "_" further down.
# NOTE(review): the label 'Twist_GCfix_45_0' occurs twice (PCR/45c and
# Aging/0b) -- confirm that GroupAnalysis tolerates duplicate labels.
experiment_dirs = [
    ('Genscript_GCfix_36_0', "../data/Aging/0a_Genscript_GCfix"),
    ('Genscript_GCfix_42_0', "../data/Aging/0b_Genscript_GCfix"),
    ('Genscript_GCfix_42_2', "../data/Aging/2d_Genscript_GCfix"),
    ('Genscript_GCfix_42_4', "../data/Aging/4d_Genscript_GCfix"),
    ('Genscript_GCfix_42_7', "../data/Aging/7d_Genscript_GCfix"),
    ('Genscript_GCfix_10_0', "../data/PCR/10c_Genscript_GCfix"),
    ('Genscript_GCfix_15_0', "../data/PCR/15c_Genscript_GCfix"),
    ('Genscript_GCfix_20_0', "../data/PCR/20c_Genscript_GCfix"),
    ('Genscript_GCfix_25_0', "../data/PCR/25c_Genscript_GCfix"),
    ('Genscript_GCall_36_0', "../data/Aging/0a_Genscript_GCall"),
    ('Genscript_GCall_42_0', "../data/Aging/0b_Genscript_GCall"),
    ('Genscript_GCall_42_2', "../data/Aging/2d_Genscript_GCall"),
    ('Genscript_GCall_42_4', "../data/Aging/4d_Genscript_GCall"),
    ('Genscript_GCall_42_7', "../data/Aging/7d_Genscript_GCall"),
    ('Genscript_GCall_15_0', "../data/PCR/15c_Genscript_GCall"),
    ('Genscript_GCall_20_0', "../data/PCR/20c_Genscript_GCall"),
    ('Genscript_GCall_25_0', "../data/PCR/25c_Genscript_GCall"),
    ('Genscript_GCall_30_0', "../data/PCR/30c_Genscript_GCall"),
    ('Twist_GCall_15_0', "../data/PCR/15c_Twist_GCall"),
    ('Twist_GCall_30_0', "../data/PCR/30c_Twist_GCall"),
    ('Twist_GCall_45_0', "../data/PCR/45c_Twist_GCall"),
    ('Twist_GCall_60_0', "../data/PCR/60c_Twist_GCall"),
    ('Twist_GCall_75_0', "../data/PCR/75c_Twist_GCall"),
    ('Twist_GCall_90_0', "../data/PCR/90c_Twist_GCall"),
    ('Twist_GCall_37_0', "../data/Aging/0a_Twist_GCall"),
    ('Twist_GCall_43_0', "../data/Aging/0b_Twist_GCall"),
    ('Twist_GCall_43_2', "../data/Aging/2d_Twist_GCall"),
    ('Twist_GCall_43_4', "../data/Aging/4d_Twist_GCall"),
    ('Twist_GCall_43_7', "../data/Aging/7d_Twist_GCall"),
    ('Twist_GCfix_15_0', "../data/PCR/15c_Twist_GCfix"),
    ('Twist_GCfix_30_0', "../data/PCR/30c_Twist_GCfix"),
    ('Twist_GCfix_45_0', "../data/PCR/45c_Twist_GCfix"),
    ('Twist_GCfix_60_0', "../data/PCR/60c_Twist_GCfix"),
    ('Twist_GCfix_75_0', "../data/PCR/75c_Twist_GCfix"),
    ('Twist_GCfix_90_0', "../data/PCR/90c_Twist_GCfix"),
    ('Twist_GCfix_39_0', "../data/Aging/0a_Twist_GCfix"),
    ('Twist_GCfix_45_0', "../data/Aging/0b_Twist_GCfix"),
    ('Twist_GCfix_45_2', "../data/Aging/2d_Twist_GCfix"),
    ('Twist_GCfix_45_4', "../data/Aging/4d_Twist_GCfix"),
    ('Twist_GCfix_45_7', "../data/Aging/7d_Twist_GCfix"),
]

# Load one ErrorAnalysis per directory, in the order listed above, and
# bundle them into a single GroupAnalysis.
data = analysis.GroupAnalysis([
    (label, analysis.ErrorAnalysis(directory))
    for label, directory in experiment_dirs
])

In [3]:
def summary(group):
    """Return mean, std, min and max of ``group.val`` as an unnamed Series.

    The Series is indexed by the statistic names in the order
    ``mean``, ``std``, ``min``, ``max``.
    """
    values = group.val
    stat_names = ['mean', 'std', 'min', 'max']
    # Each statistic name is also the name of the Series method computing it.
    stats = {name: getattr(values, name)() for name in stat_names}
    return pd.Series(stats, index=stat_names)

# Work on a private copy of the overall error rates, with the rate column
# renamed to 'val' (the name used by summary() and the model formula).
design_columns = ['synthesis', 'GC', 'PCRC', 'Decay']
errordata = data.data['overall_error_rates'].copy().rename(columns={'rate': 'val'})

# Group labels have the form "<synthesis>_<GC>_<PCRC>_<Decay>"; split them
# into separate columns and make the two numeric factors integers.
errordata[design_columns] = errordata['group'].str.split('_', expand=True)
errordata = errordata.astype({'PCRC': 'int', 'Decay': 'int'})

# Descriptive statistics: first per error type, then per synthesis and type.
for grouping in (['type'], ['synthesis', 'type']):
    display(errordata.groupby(grouping).apply(summary))

Unnamed: 0_level_0,mean,std,min,max
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
deletions,0.006651,0.006948,0.000391,0.017075
delevents,0.005544,0.005849,0.000217,0.013984
insertions,0.00028,0.000234,4.9e-05,0.000637
substitutions,0.007906,0.001956,0.003636,0.012487


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,min,max
synthesis,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Genscript,deletions,0.014084,0.002371,0.010155,0.017075
Genscript,delevents,0.011854,0.001658,0.008989,0.013984
Genscript,insertions,0.000528,9.1e-05,0.000415,0.000637
Genscript,substitutions,0.008025,0.001631,0.004396,0.010994
Twist,deletions,0.00057,0.000102,0.000391,0.000789
Twist,delevents,0.000381,9.5e-05,0.000217,0.00059
Twist,insertions,7.7e-05,1.5e-05,4.9e-05,0.000114
Twist,substitutions,0.007808,0.002201,0.003636,0.012487


In [4]:
# ANOVA for deletion rates; the returned table feeds the explained-variance figure.
del_table = anova(errordata, 'deletions')

0,1,2,3
Dep. Variable:,val,R-squared:,0.949
Model:,OLS,Adj. R-squared:,0.947
Method:,Least Squares,F-statistic:,470.0
Date:,"Wed, 21 Jun 2023",Prob (F-statistic):,5.9e-49
Time:,10:14:04,Log-Likelihood:,403.45
No. Observations:,80,AIC:,-798.9
Df Residuals:,76,BIC:,-789.4
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0144,0.000,32.530,0.000,0.014,0.015
C(synthesis)[T.Twist],-0.0133,0.000,-32.961,0.000,-0.014,-0.013
PCRC,-1.012e-05,1.1e-05,-0.920,0.360,-3.2e-05,1.18e-05
Decay,-3.1e-08,7.91e-05,-0.000,1.000,-0.000,0.000

0,1,2,3
Omnibus:,0.91,Durbin-Watson:,0.275
Prob(Omnibus):,0.634,Jarque-Bera (JB):,0.653
Skew:,-0.221,Prob(JB):,0.721
Kurtosis:,3.03,Cond. No.,112.0


Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq
C(synthesis),0.00239659,1.0,933.7566,1.9378749999999998e-44,92.3%
PCRC,4.617652e-06,1.0,1.799124,0.183813,0.2%
Decay,3.336901e-13,1.0,1.30012e-07,0.9997133,0.0%
Residual,0.0001950624,76.0,,,7.5%


In [5]:
# ANOVA for substitution rates; the returned table feeds the explained-variance figure.
sub_table = anova(errordata, 'substitutions')

0,1,2,3
Dep. Variable:,val,R-squared:,0.888
Model:,OLS,Adj. R-squared:,0.884
Method:,Least Squares,F-statistic:,200.9
Date:,"Wed, 21 Jun 2023",Prob (F-statistic):,4.91e-36
Time:,10:14:04,Log-Likelihood:,473.51
No. Observations:,80,AIC:,-939.0
Df Residuals:,76,BIC:,-929.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0044,0.000,24.010,0.000,0.004,0.005
C(synthesis)[T.Twist],-0.0020,0.000,-11.766,0.000,-0.002,-0.002
PCRC,0.0001,4.58e-06,23.725,0.000,9.96e-05,0.000
Decay,0.0001,3.29e-05,3.470,0.001,4.87e-05,0.000

0,1,2,3
Omnibus:,0.799,Durbin-Watson:,0.969
Prob(Omnibus):,0.671,Jarque-Bera (JB):,0.322
Skew:,0.075,Prob(JB):,0.851
Kurtosis:,3.272,Cond. No.,112.0


Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq
C(synthesis),5.1e-05,1.0,115.403128,6.656245e-17,7.9%
PCRC,0.000557,1.0,1251.414821,5.878049e-49,86.2%
Decay,4e-06,1.0,9.375963,0.003039231,0.6%
Residual,3.4e-05,76.0,,,5.2%


In [6]:
# ANOVA for insertion rates. NOTE(review): ins_table is not used by the
# figure cell visible below, which plots deletions and substitutions only.
ins_table = anova(errordata, 'insertions')

0,1,2,3
Dep. Variable:,val,R-squared:,0.932
Model:,OLS,Adj. R-squared:,0.929
Method:,Least Squares,F-statistic:,346.8
Date:,"Wed, 21 Jun 2023",Prob (F-statistic):,3.05e-44
Time:,10:14:05,Log-Likelihood:,663.31
No. Observations:,80,AIC:,-1319.0
Df Residuals:,76,BIC:,-1309.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0005,1.72e-05,29.914,0.000,0.000,0.001
C(synthesis)[T.Twist],-0.0005,1.57e-05,-29.106,0.000,-0.000,-0.000
PCRC,4.188e-07,4.27e-07,0.980,0.330,-4.32e-07,1.27e-06
Decay,1.432e-08,3.07e-06,0.005,0.996,-6.1e-06,6.13e-06

0,1,2,3
Omnibus:,2.465,Durbin-Watson:,0.197
Prob(Omnibus):,0.292,Jarque-Bera (JB):,1.556
Skew:,0.011,Prob(JB):,0.459
Kurtosis:,2.317,Cond. No.,112.0


Unnamed: 0,sum_sq,df,F,PR(>F),eta_sq
C(synthesis),3.224748e-06,1.0,832.765385,1.067362e-42,91.4%
PCRC,8.186384e-09,1.0,2.114068,0.1500682,0.2%
Decay,8.529131e-14,1.0,2.2e-05,0.9962677,0.0%
Residual,2.942976e-07,76.0,,,8.3%


In [7]:
# Tag each ANOVA table with its x-axis label, then combine them for plotting.
sub_table['type'] = 'Subs.'
del_table['type'] = 'Dels.'
df = pd.concat([del_table, sub_table])

label_font_size = 28/3
grey_shades = ["#252525", "#636363", "#969696", "#cccccc"]

# One bar per error type, coloured by model term (the ANOVA table index).
fig = px.bar(
    df,
    x="type",
    y="eta_sq",
    color=df.index,
    color_discrete_sequence=grey_shades,
)

# Figure-wide styling; the legend sits below the plot inside the bottom margin.
fig.update_layout(
    template='simple_white',
    height=375,
    width=125,
    margin=dict(l=0, r=0, t=0, b=150),
    font_family="Inter",
    legend=dict(
        yanchor="top",
        y=-0.15,
        xanchor="left",
        x=-0.4,
        title_text='',
        font_size=label_font_size,
    ),
)
fig.update_yaxes(
    title_text='Explained variance',
    tickformat=",.0%",
    range=[0, 1],
    title_font_size=label_font_size,
    tickfont_size=label_font_size,
)
fig.update_xaxes(
    title_text='',
    title_font_size=label_font_size,
    tickfont_size=label_font_size,
)

fig.show()
fig.write_image("anova.svg")