In [1]:
import numpy as np
from scipy.optimize import curve_fit
import scipy.stats
import plotly.graph_objects as pg
import plotly.express as px
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2
import dt4dds.analysis as analysis

# Load every sequencing run and group the per-run error analyses by the label
# in the first tuple element ('Genscript' or 'Twist').
# Each spec is (group label, path handed to ErrorAnalysis, extra keyword arguments);
# the two Genscript GCall runs additionally pass `skip` ranges.
data = analysis.GroupAnalysis([
    (label, analysis.ErrorAnalysis(path, **options))
    for label, path, options in [
        ('Genscript', "../../data/phix/Genscript_GCfix/Genscript_GCfix_aging", {}),
        ('Genscript', "../../data/phix/Genscript_GCfix/Genscript_GCfix_PCR", {}),
        ('Genscript', "../../data/phix/Genscript_GCall/Genscript_GCall_aging", {'skip': [(84, 86), (12, 16)]}),
        ('Genscript', "../../data/phix/Genscript_GCall/Genscript_GCall_PCR", {'skip': [(84, 86), (12, 16)]}),
        ('Twist', "../../data/phix/Twist/Twist_GCall_PCR", {}),
        ('Twist', "../../data/phix/Twist/Twist_GCfix_PCR", {}),
        ('Twist', "../../data/phix/Twist/Twist_GCall_aging", {}),
        ('Twist', "../../data/phix/Twist/Twist_GCfix_aging", {}),
    ]
])

In [3]:
# Take a copy of the 'substitutions_by_refposition_by_type' table so the steps
# below work on their own object rather than the one stored in `data.data`
# (presumably a pandas DataFrame with a `rate` column -- confirm in dt4dds).
plot_data = data.data['substitutions_by_refposition_by_type'].copy()

def wavg(group):
    """Summarise the ``rate`` values of one group.

    Despite the name, no weighting is applied: the plain mean is returned
    together with the standard deviation and the number of counted values.

    Parameters
    ----------
    group
        Object exposing a ``rate`` attribute that provides ``mean()``,
        ``std()`` and ``count()`` (e.g. one group of a ``DataFrame.groupby``).

    Returns
    -------
    pd.Series
        Indexed by ``'mean'``, ``'std'`` and ``'count'``, in that order.
    """
    rates = group.rate
    return pd.Series(
        [rates.mean(), rates.std(), rates.count()],
        index=['mean', 'std', 'count'],
    )


# Per (type, read, position) statistics of `rate` over all rows of `plot_data`.
# Built-in named aggregation yields the same mean / std / count columns as
# applying `wavg` to every group, but runs vectorised and does not trigger the
# pandas >= 2.2 deprecation warning for `groupby.apply` on grouping columns.
df_aggregate = (
    plot_data
    .groupby(['type', 'read', 'position'], as_index=False)
    .agg(
        mean=('rate', 'mean'),
        std=('rate', 'std'),
        count=('rate', 'count'),
    )
    # `wavg` returned an all-float Series, so keep `count` as float for later cells.
    .astype({'count': 'float64'})
)

# `type` looks like "<from>2<to>" (e.g. "A2C"); split once on the separator so
# exactly two columns are produced.
df_aggregate[['from_base', 'to_base']] = df_aggregate["type"].str.split('2', n=1, expand=True)

df_aggregate

Unnamed: 0,type,read,position,mean,std,count,from_base,to_base
0,A2C,1,0,0.000032,0.000021,7.0,A,C
1,A2C,1,1,0.000014,0.000007,2.0,A,C
2,A2C,1,2,0.000013,0.000006,2.0,A,C
3,A2C,1,3,0.000023,0.000021,2.0,A,C
4,A2C,1,4,0.000011,0.000002,2.0,A,C
...,...,...,...,...,...,...,...,...
2752,T2G,2,113,0.000580,0.000166,2.0,T,G
2753,T2G,2,114,0.000578,0.000204,2.0,T,G
2754,T2G,2,115,0.000532,0.000245,2.0,T,G
2755,T2G,2,116,0.000512,0.000217,2.0,T,G


Fill in placeholder rows for the non-error types (A2A, C2C, G2G, T2T) so that the diagonal panels are also shown in the plot.

In [5]:
# One placeholder row per identity "substitution" (A2A, C2C, G2G, T2T) so every
# from_base/to_base combination occurs in the data at least once.
# The placeholder mean of 1.0 becomes 1000 after the x1e3 scaling applied when
# plotting, i.e. far outside the plotted y-range of [0, 1.5].
fillerdf = pd.DataFrame([
    {
        "type": f"{base}2{base}",
        "read": 1,
        "position": 1,
        "mean": 1.0,
        "std": 0.0,
        "count": 1.0,
        "from_base": base,
        "to_base": base,
    }
    for base in "ACGT"
])

# Append only the placeholder types that are not present yet: the first run
# adds all four rows, and re-running this cell no longer stacks duplicates
# onto `df_aggregate`.
placeholder_missing = ~fillerdf["type"].isin(df_aggregate["type"])
if placeholder_missing.any():
    df_aggregate = pd.concat(
        [df_aggregate, fillerdf[placeholder_missing]],
        ignore_index=True,
    )

In [6]:
# Facet grid of mean substitution rates (scaled by 1e3) versus position:
# one column per original base, one row per substituted base.
# NOTE(review): `read` is numeric in the table above, so plotly express will
# probably colour it on a continuous scale -- confirm this is intended.
fig = px.scatter(
    df_aggregate,
    x="position",
    y=df_aggregate["mean"] * 1e3,
    color="read",
    facet_col="from_base",
    facet_row="to_base",
    category_orders=dict(
        from_base=["A", "C", "G", "T"],
        to_base=["A", "C", "G", "T"],
    ),
)

fig.update_traces(marker_size=3)

# Axis titles are set first (only on row 1 / column 1); the font settings that
# follow are applied to every axis afterwards.
fig.update_xaxes(title='Position', row=1)
fig.update_xaxes(
    minor=dict(ticks="outside", dtick=25),
    title_font=dict(family="Inter", size=28/3),
    tickfont=dict(size=28/3),
)

fig.update_yaxes(title='Substitution rate / 10<sup>-3</sup> nt<sup>-1</sup>', col=1)
fig.update_yaxes(
    range=[0, 1.5],
    dtick=0.5,
    minor=dict(ticks="outside", dtick=0.25),
    title_font=dict(family="Inter", size=28/3),
    tickfont=dict(size=28/3),
)

fig.update_layout(
    template='simple_white',
    height=660,
    width=660,
    showlegend=False,
    margin=dict(l=0, r=20, t=20, b=0),
    font=dict(family="Inter"),
    legend=dict(font=dict(size=28/3)),
)

# Render inline, then export the same figure as SVG to the working directory.
fig.show()
fig.write_image("seq_errorrate.svg")