In [None]:
import numpy as np
import pandas as pd
import scipy.stats
import plotly.express as px
import plotly.graph_objects as pg
from scipy.optimize import curve_fit

import dt4dds.analysis.dataaggregation as analysis

import sys
sys.path.append('..')
import plotting

In [None]:
# Hex color used for each error type; passed as `color_discrete_map`
# to the error-rate bar charts in this notebook.
error_colors = dict(
    substitutions='#e6550d',
    insertions='#3182bd',
    deletions='#756bb1',
)

In [None]:
# Analysis folder of every experimental sample, keyed by its "<dataset>_<exp>" label.
# The insertion order of this mapping is also the order stored in `order`.
sample_dirs = {
    'Meiser_unaged': "../data_experimental/Aging_Meiser/unaged/analysis",
    'Meiser_aged': "../data_experimental/Aging_Meiser/aged/analysis",
    'Meiser_repaired': "../data_experimental/Aging_Meiser/repaired/analysis",
    'Song_0d': "../data_experimental/Aging_Song/0d/analysis",
    'Song_28d': "../data_experimental/Aging_Song/28d/analysis",
    'Song_56d': "../data_experimental/Aging_Song/56d/analysis",
    'Song_70d': "../data_experimental/Aging_Song/70d/analysis",
}

# One ErrorAnalysis per sample, bundled into a single GroupAnalysis under its label.
data = analysis.GroupAnalysis([
    (label, analysis.ErrorAnalysis(folder, local=True, paired=False))
    for label, folder in sample_dirs.items()
])
order = list(sample_dirs)

# Collect median error rates from positional error data

In [None]:
# Median of the per-position error rates for every (error type, group) pair.
# NOTE: despite the name `mean_error_rates`, the statistic stored here is the median.
mean_error_rates = {}

for errortype in ['substitutions', 'insertions', 'deletions']:

    # the table is only read below, so no defensive copy is needed
    positional_rates = data.data[f'{errortype}_by_refposition']

    for group in positional_rates.group.unique():
        in_group = positional_rates.group == group
        mean_error_rates[(errortype, group)] = positional_rates.loc[in_group, 'rate'].median()

# long format: one row per (errortype, group) pair with its median rate
mean_error_rates = pd.Series(mean_error_rates).to_frame('rate').reset_index(names=['errortype', 'group'])

# Split the "<dataset>_<exp>" label at the FIRST underscore only (n=1): without the
# limit, a label containing a second underscore expands into three columns and the
# two-column assignment below raises.
mean_error_rates[['dataset', 'exp']] = mean_error_rates['group'].str.split('_', n=1, expand=True)

mean_error_rates

In [None]:
# The x-axis shows the `exp` column, so the display order must be declared for `exp`.
# (Ordering the unused `dataset` column by full "<dataset>_<exp>" labels had no effect.)
# Derive the order from `order` by dropping the "<dataset>_" prefix; dict.fromkeys
# removes duplicates while keeping first-seen order.
exp_order = list(dict.fromkeys(label.split('_', 1)[-1] for label in order))

# Stacked bars: one bar per experiment, one colored segment per error type.
fig = px.bar(
    mean_error_rates,
    x='exp',
    y='rate',
    color='errortype',
    color_discrete_map=error_colors,
    category_orders={'exp': exp_order},
)

fig.update_xaxes(tickangle=90)
fig.update_yaxes(dtick=0.005, minor_dtick=0.0025)
fig.update_layout(
    yaxis_title='Error rate per nt',
    xaxis_title='',
    margin=dict(l=0, r=10, t=10, b=0),
    width=210,
    height=150,
    showlegend=False,
)

# Apply the project-wide figure styling, then export and display the figure.
fig = plotting.standardize_plot(fig)
fig.write_image('./figures/aging_error_rates.svg')
fig.show()

# Repeat the analysis with the simulated data included

In [None]:
# Analysis folder of every sample, keyed by its "<dataset>_<exp>" label: the seven
# experimental samples plus the simulated decay run.
# The insertion order of this mapping is also the order stored in `order`.
sample_dirs = {
    'Meiser_unaged': "../data_experimental/Aging_Meiser/unaged/analysis",
    'Meiser_aged': "../data_experimental/Aging_Meiser/aged/analysis",
    'Meiser_repaired': "../data_experimental/Aging_Meiser/repaired/analysis",
    'Song_0d': "../data_experimental/Aging_Song/0d/analysis",
    'Song_28d': "../data_experimental/Aging_Song/28d/analysis",
    'Song_56d': "../data_experimental/Aging_Song/56d/analysis",
    'Song_70d': "../data_experimental/Aging_Song/70d/analysis",
    'simulated_simulated': "../data_simulated/test_decay/analysis",
}

# One ErrorAnalysis per sample, bundled into a single GroupAnalysis under its label.
data = analysis.GroupAnalysis([
    (label, analysis.ErrorAnalysis(folder, local=True, paired=False))
    for label, folder in sample_dirs.items()
])
order = list(sample_dirs)

In [None]:
# Median of the per-position error rates for every (error type, group) pair,
# now computed on the grouped data that also contains the simulated run.
# NOTE: despite the name `mean_error_rates`, the statistic stored here is the median.
mean_error_rates = {}

for errortype in ['substitutions', 'insertions', 'deletions']:

    # the table is only read below, so no defensive copy is needed
    positional_rates = data.data[f'{errortype}_by_refposition']

    for group in positional_rates.group.unique():
        in_group = positional_rates.group == group
        mean_error_rates[(errortype, group)] = positional_rates.loc[in_group, 'rate'].median()

# long format: one row per (errortype, group) pair with its median rate
mean_error_rates = pd.Series(mean_error_rates).to_frame('rate').reset_index(names=['errortype', 'group'])

# Split the "<dataset>_<exp>" label at the FIRST underscore only (n=1): without the
# limit, a label containing a second underscore expands into three columns and the
# two-column assignment below raises.
mean_error_rates[['dataset', 'exp']] = mean_error_rates['group'].str.split('_', n=1, expand=True)

mean_error_rates

In [None]:
# The x-axis shows the `exp` column, so the display order must be declared for `exp`.
# (Ordering the unused `dataset` column by full "<dataset>_<exp>" labels had no effect.)
# Derive the order from `order` by dropping the "<dataset>_" prefix; dict.fromkeys
# removes duplicates while keeping first-seen order.
exp_order = list(dict.fromkeys(label.split('_', 1)[-1] for label in order))

# Stacked bars: one bar per experiment (including the simulated run),
# one colored segment per error type.
fig = px.bar(
    mean_error_rates,
    x='exp',
    y='rate',
    color='errortype',
    color_discrete_map=error_colors,
    category_orders={'exp': exp_order},
)

fig.update_xaxes(tickangle=90)
fig.update_yaxes(dtick=0.005, minor_dtick=0.0025)
fig.update_layout(
    yaxis_title='Error rate per nt',
    xaxis_title='',
    margin=dict(l=0, r=10, t=10, b=0),
    width=210,
    height=150,
    showlegend=False,
)

# Apply the project-wide figure styling, then export to the SI folder and display.
fig = plotting.standardize_plot(fig)
fig.write_image('./SI_figures/aging_error_rates.svg')
fig.show()

In [None]:
# Relative share of each error subtype (the `type` column) within every
# (error type, group) pair.
# NOTE: this rebinds `mean_error_rates` to a dict of Series, replacing the
# DataFrame of the same name built in the earlier cells.
mean_error_rates = {}

for errortype in ['substitutions', 'insertions', 'deletions']:

    # the table is only read below, so no defensive copy is needed
    typed_rates = data.data[f'{errortype}_by_refposition_by_type']

    for group in typed_rates.group.unique():
        group_data = typed_rates[typed_rates.group == group]
        # median positional rate per subtype, normalized so the shares sum to 1
        rates = group_data.groupby("type").rate.median()
        mean_error_rates[(errortype, group)] = rates/np.sum(rates)

# convert to dataframe: rows are subtypes, columns are (errortype, group) pairs
idf = pd.DataFrame(mean_error_rates)

# go from wide to long format: the group level moves into the rows and, being
# unnamed, shows up as the `level_1` column after reset_index
idf = idf.stack(future_stack=True).reset_index()

# Split the "<dataset>_<exp>" label at the FIRST underscore only (n=1): without the
# limit, a label containing a second underscore expands into three columns and the
# two-column assignment below raises.
idf[['dataset', 'exp']] = idf['level_1'].str.split('_', n=1, expand=True)

# remove all rows whose subtype involves N
idf = idf[~idf.type.str.contains('N')]

idf

In [None]:
# Bar color for every value of the `type` column: single-letter keys and
# 'X2Y' keys (presumably "X replaced by Y" substitutions; confirm against the
# analysis output), grouped here by their first base.
color_map = {
    'A': '#31a354', 'A2C': '#74c476', 'A2G': '#31a354', 'A2T': '#006d2c',
    'C': '#3182bd', 'C2A': '#6baed6', 'C2G': '#3182bd', 'C2T': '#08519c',
    'G': '#fd8d3c', 'G2A': '#fdbe85', 'G2C': '#fd8d3c', 'G2T': '#e6550d',
    'T': '#de2d26', 'T2A': '#fb6a4a', 'T2C': '#de2d26', 'T2G': '#a50f15',
}

# Layout settings shared by all three ratio plots.
shared_layout = dict(
    width=320,
    height=250,
    margin=dict(l=0, r=10, t=10, b=0),
    showlegend=False,
    xaxis_title='',
)

# One stacked bar chart per error type: the subtype shares per experiment on a
# fixed 0-100 % axis. The figures are displayed only, not written to disk.
for errortype in ("substitutions", "deletions", "insertions"):
    fig = (
        px.bar(idf, x='exp', y=errortype, color='type', color_discrete_map=color_map)
        .update_yaxes(title_text=f'Ratio of {errortype}', tickformat=".0%", range=[0, 1])
        .update_layout(**shared_layout)
    )

    fig = plotting.standardize_plot(fig)
    fig.show()