In [None]:
import numpy as np
import pandas as pd
import scipy.stats
import plotly.graph_objects as pg
from plotly.subplots import make_subplots
import statsmodels.stats.descriptivestats as ds

import dt4dds.analysis.dataaggregation as analysis

import sys
sys.path.append('..')
import plotting

In [None]:
# Bar colour used for each error type in all figures of this notebook.
error_colors = dict(
    substitutions='#e6550d',
    insertions='#3182bd',
    deletions='#756bb1',
)

In [None]:
# Experimental photolithographic synthesis datasets: group label -> analysis folder.
# The insertion order of this mapping is also the order stored in `order`.
experiment_dirs = {
    'Lietard_Normal': "../data_experimental/Photolithographic_Lietard/normal/analysis",
    'Lietard_Capped': "../data_experimental/Photolithographic_Lietard/capped/analysis",
    'Lietard_Spaced': "../data_experimental/Photolithographic_Lietard/spaced/analysis",
    'Lietard_highdensity': "../data_experimental/Photolithographic_Lietard/high_density/analysis",
    'Antkowiak_File1': "../data_experimental/Photolithographic_Antkowiak/File1/analysis",
}

# One ErrorAnalysis per experiment, bundled into a single GroupAnalysis.
data = analysis.GroupAnalysis([
    (label, analysis.ErrorAnalysis(folder, local=True, paired=False))
    for label, folder in experiment_dirs.items()
])
order = list(experiment_dirs)

# Median error rates

In [None]:
# Median per-position error rate for every (error type, experiment) pair.
# NOTE: the variable is called "mean_error_rates", but the statistic taken per
# experiment is the median; the mean is only taken across experiments below.
median_rates = {}
for errortype in ['substitutions', 'insertions', 'deletions']:
    rates_by_position = data.data[f'{errortype}_by_refposition'].copy()
    for group in rates_by_position.group.unique():
        in_group = rates_by_position.group == group
        median_rates[(errortype, group)] = rates_by_position[in_group].rate.median()

mean_error_rates = (
    pd.Series(median_rates)
    .to_frame('rate')
    .reset_index(names=['errortype', 'group'])
)


def summary(group):
    """Summarise the `rate` column of one error type across experiments.

    Returns a Series with 'mean' (the mean of the rates) and 'std' (the
    'std_err' statistic reported by statsmodels' `describe`).
    """
    stats = ds.describe(group.rate)
    return pd.Series(
        {
            'mean': stats.loc['mean', 'rate'],
            'std': stats.loc['std_err', 'rate'],
        },
        index=['mean', 'std'],
    )


# one summary row per error type
mean_error_rates = mean_error_rates.groupby(['errortype'], as_index=False).apply(summary, include_groups=False)

mean_error_rates

# Median rates of error events

In [None]:
# Convert per-base error rates into rates of error *events*: the median
# per-position rate of each experiment is divided by the mean length of a run
# of consecutive errors of the same error type in that experiment.
length_frequencies = data.data["error_frequency_by_length"]

mean_error_event_rates = {}

for errortype in ['substitutions', 'insertions', 'deletions']:

    idata = data.data[f'{errortype}_by_refposition'].copy()

    for group in idata.group.unique():
        group_data = idata[idata.group == group]
        mean_error_rate = group_data.rate.median()

        # calculate the frequency-weighted mean length per event for THIS error
        # type. The filter used to be hard-coded to 'substitutions', so the
        # insertion and deletion rates were divided by the substitution length.
        selection = (length_frequencies['group'] == group) & (length_frequencies['type'] == errortype)
        lengths = length_frequencies.loc[selection].groupby('length')['value'].mean()
        mean_length = np.sum(lengths.index.to_numpy() * lengths.to_numpy()) / np.sum(lengths.to_numpy())

        # get the mean error event rate
        mean_error_event_rates[(errortype, group)] = mean_error_rate / mean_length

mean_error_event_rates = pd.Series(mean_error_event_rates).to_frame('rate').reset_index(names=['errortype', 'group'])


def summary(group):
    """Summarise the `rate` column of one error type across experiments.

    Returns a Series with 'mean' (the mean of the rates) and 'std' (the
    'std_err' statistic reported by statsmodels' `describe`).
    """
    stats = ds.describe(group.rate)
    d = {
        'mean': stats.loc['mean', 'rate'],
        'std': stats.loc['std_err', 'rate'],
    }
    return pd.Series(d, index=['mean', 'std'])


# one summary row per error type
mean_error_event_rates = mean_error_event_rates.groupby(['errortype'], as_index=False).apply(summary, include_groups=False)

mean_error_event_rates

# Consecutive errors

In [None]:
# Runs of MAX_LENGTH or more consecutive errors are pooled into one "N+" bin.
MAX_LENGTH = 4

# get data for errors by length
idata = data.data["error_frequency_by_length"].copy()

# clip to maximum length, sum up all errors with length >= MAX_LENGTH.
# The pooled rows are separated with a boolean mask instead of being dropped by
# index label: a label-based drop would also remove unrelated rows if the
# aggregated frame carries repeated index labels.
idata['length'] = idata['length'].clip(upper=MAX_LENGTH)
is_pooled = idata['length'] == MAX_LENGTH
newframe = idata.loc[is_pooled].groupby(['type', 'exp', 'read', 'group'])['value'].sum().reset_index()
newframe['length'] = MAX_LENGTH
idata = pd.concat([idata.loc[~is_pooled], newframe], ignore_index=True)


def summary(group):
    """Summarise the `value` column of one (type, length) bin across experiments.

    Returns a Series with 'mean' (the mean of the values) and 'std' (the
    'std_err' statistic reported by statsmodels' `describe`).
    """
    stats = ds.describe(group.value)
    d = {
        'mean': stats.loc['mean', 'value'],
        'std': stats.loc['std_err', 'value'],
    }
    return pd.Series(d, index=['mean', 'std'])


df_aggregate = idata.groupby(['type', 'length'], as_index=False).apply(summary, include_groups=False)


# Two stacked panels showing the same bars with different y-ranges (set below),
# which gives a broken-axis view of the distribution.
fig = make_subplots(
    rows=2,
    shared_xaxes=True,
    vertical_spacing=0.05,
    row_heights=[0.3, 0.7],
)

for error_type in ['substitutions', 'insertions', 'deletions']:
    this_data = df_aggregate.loc[df_aggregate['type'] == error_type].copy()
    # identical trace in the upper (row 1) and lower (row 2) panel
    for row in (1, 2):
        fig.add_trace(
            pg.Bar(
                x=this_data['length'],
                y=this_data['mean'],
                error_y=dict(
                    type='data',
                    array=this_data['std'],
                    color='#222222',
                    visible=True,
                    thickness=1.5,
                ),
                marker_color=error_colors[error_type]
            ),
            col=1,
            row=row
        )

fig.update_layout(
    template='simple_white',
    height=200,
    width=250,
    showlegend=False,
    barmode='group',
    margin=dict(l=50, r=0, t=5, b=0),
    font_family="Inter",
    legend_font_size=28/3,
)

# label the last bin as "N+" because it pools all longer runs
ticktext = list(map(str, range(1, MAX_LENGTH+1)))
ticktext[-1] += '+'

# hide the x-axis everywhere, then re-enable it for the bottom panel only
fig.update_xaxes(
    showticklabels=False,
    visible=False,
    range=[0.5, MAX_LENGTH+0.5],
    tickmode='array',
    tickvals=list(range(1, MAX_LENGTH+1)),
    ticktext=ticktext,
)
fig.update_xaxes(title="Length of consecutive errors", showticklabels=True, visible=True, row=2, col=1)

fig.update_yaxes(tickformat=",.0%")
fig.update_yaxes(title="Fraction of errors", range=[0, 0.17], row=2, dtick=0.05, minor_dtick=0.025)
fig.update_yaxes(range=[0.725, 1.0], row=1, dtick=0.2, minor_dtick=0.1)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./figures/error_distribution_by_length.svg")


# compare to the theoretical model of geometric distribution:
# a run continues with probability equal to the error rate and stops otherwise
run_lengths = np.arange(1, MAX_LENGTH+1)
df_aggregate['theory'] = 0.0
for errortype in mean_error_rates.errortype.unique():
    # scalar error rate of this type (one summary row per error type)
    error_rate = mean_error_rates.loc[mean_error_rates.errortype == errortype, 'mean'].iloc[0]
    ratios = scipy.stats.geom.pmf(run_lengths, 1-error_rate)
    # the last bin collects the whole remaining tail of the distribution
    ratios[-1] = 1-np.sum(ratios[0:-1])
    # assign by run length rather than by position, so that a type with a
    # missing length bin does not break or misalign the assignment
    theory_by_length = dict(zip(run_lengths.tolist(), ratios.tolist()))
    selection = df_aggregate['type'] == errortype
    df_aggregate.loc[selection, 'theory'] = df_aggregate.loc[selection, 'length'].astype(int).map(theory_by_length)
df_aggregate

# Errors per read

In [None]:
# Reads with MAX_ERRORS or more errors are pooled into one "N+" bin.
MAX_ERRORS = 6
# Read length assumed by the binomial comparison at the end of this cell.
N_BASES_PER_READ = 60

# get data for errors by read; take an explicit copy of the filtered rows so
# that the column assignment below does not act on a slice of another frame
idata = data.data["error_frequency_by_read"]
idata = idata.loc[idata['type'].isin(['substitutions', 'insertions', 'deletions'])].copy()

# clip to maximum number, sum up all reads with MAX_ERRORS or more errors.
# The pooled rows are separated with a boolean mask instead of being dropped by
# index label: a label-based drop would also remove unrelated rows if the
# aggregated frame carries repeated index labels.
idata['frequency'] = idata['frequency'].clip(upper=MAX_ERRORS)
is_pooled = idata['frequency'] == MAX_ERRORS
newframe = idata.loc[is_pooled].groupby(['type', 'exp', 'read', 'group'])['value'].sum().reset_index()
newframe['frequency'] = MAX_ERRORS
idata = pd.concat([idata.loc[~is_pooled], newframe], ignore_index=True)


def summary(group):
    """Summarise the `value` column of one (type, frequency) bin across experiments.

    Returns a Series with 'mean' (the mean of the values) and 'std' (the
    'std_err' statistic reported by statsmodels' `describe`).
    """
    stats = ds.describe(group.value)
    d = {
        'mean': stats.loc['mean', 'value'],
        'std': stats.loc['std_err', 'value'],
    }
    return pd.Series(d, index=['mean', 'std'])


df_aggregate = idata.groupby(['type', 'frequency'], as_index=False).apply(summary, include_groups=False)


fig = pg.Figure()

for error_type in ['substitutions', 'insertions', 'deletions']:

    this_data = df_aggregate.loc[df_aggregate['type'] == error_type].copy()
    fig.add_trace(
        pg.Bar(
            x=this_data['frequency'],
            y=this_data['mean'],
            error_y=dict(
                type='data',
                array=this_data['std'],
                color='#222222',
                visible=True,
                thickness=1.5,
            ),
            marker_color=error_colors[error_type]
        )
    )


fig.update_layout(
    template='simple_white',
    height=200,
    width=400,
    showlegend=False,
    barmode='group',
    margin=dict(l=50, r=0, t=5, b=0),
    font_family="Inter",
    legend_font_size=28/3,
)

# label the last bin as "N+" because it pools all larger error counts
ticktext = list(map(str, range(0, MAX_ERRORS+1)))
ticktext[-1] += '+'

fig.update_xaxes(
    title="Number of errors in read",
    range=[-0.5, MAX_ERRORS+0.5],
    tickmode = 'array',
    tickvals = list(range(0, MAX_ERRORS+1)),
    ticktext = ticktext,
)

fig.update_yaxes(
    title="Fraction of reads",
    tickformat=",.0%",
    range=[0, 0.65],
    dtick=0.25,
    title_font_family="Inter",
    title_font_size=28/3,
    tickfont_size=28/3
)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./figures/error_distribution_by_read.svg")


# compare to the theoretical model of binomial distribution:
# N_BASES_PER_READ independent positions, each starting an error event with
# the mean error event rate of the respective error type
error_counts = np.arange(0, MAX_ERRORS+1)
df_aggregate['theory'] = 0.0
for errortype in mean_error_event_rates.errortype.unique():
    # scalar event rate of this type (one summary row per error type)
    event_rate = mean_error_event_rates.loc[mean_error_event_rates.errortype == errortype, 'mean'].iloc[0]
    ratios = scipy.stats.binom.pmf(error_counts, N_BASES_PER_READ, event_rate)
    # the last bin collects the whole remaining tail of the distribution
    ratios[-1] = 1-np.sum(ratios[0:-1])
    # assign by error count rather than by position, so that a type with a
    # missing bin does not break or misalign the assignment
    theory_by_count = dict(zip(error_counts.tolist(), ratios.tolist()))
    selection = df_aggregate['type'] == errortype
    df_aggregate.loc[selection, 'theory'] = df_aggregate.loc[selection, 'frequency'].astype(int).map(theory_by_count)
df_aggregate

# Redo for simulated data

In [None]:
# From here on `data` refers to the simulated dataset instead of the experiments.
simulated_label = 'simulated_simulated'
simulated_analysis = analysis.ErrorAnalysis("../data_simulated/test_photolithography/analysis", local=True, paired=False)

data = analysis.GroupAnalysis([(simulated_label, simulated_analysis)])
order = [simulated_label]

In [None]:
# Median per-position error rate for every (error type, group) pair.
# NOTE: the variable is called "mean_error_rates", but the statistic taken per
# group is the median; the mean is only taken across groups below.
median_rates = {}
for errortype in ['substitutions', 'insertions', 'deletions']:
    rates_by_position = data.data[f'{errortype}_by_refposition'].copy()
    for group in rates_by_position.group.unique():
        in_group = rates_by_position.group == group
        median_rates[(errortype, group)] = rates_by_position[in_group].rate.median()

mean_error_rates = (
    pd.Series(median_rates)
    .to_frame('rate')
    .reset_index(names=['errortype', 'group'])
)


def summary(group):
    """Summarise the `rate` column of one error type across groups.

    Returns a Series with 'mean' (the mean of the rates) and 'std' (the
    'std_err' statistic reported by statsmodels' `describe`).
    """
    stats = ds.describe(group.rate)
    return pd.Series(
        {
            'mean': stats.loc['mean', 'rate'],
            'std': stats.loc['std_err', 'rate'],
        },
        index=['mean', 'std'],
    )


# one summary row per error type
mean_error_rates = mean_error_rates.groupby(['errortype'], as_index=False).apply(summary, include_groups=False)

mean_error_rates

In [None]:
# Convert per-base error rates into rates of error *events*: the median
# per-position rate of each group is divided by the mean length of a run of
# consecutive errors of the same error type in that group.
length_frequencies = data.data["error_frequency_by_length"]

mean_error_event_rates = {}

for errortype in ['substitutions', 'insertions', 'deletions']:

    idata = data.data[f'{errortype}_by_refposition'].copy()

    for group in idata.group.unique():
        group_data = idata[idata.group == group]
        mean_error_rate = group_data.rate.median()

        # calculate the frequency-weighted mean length per event for THIS error
        # type. The filter used to be hard-coded to 'substitutions', so the
        # insertion and deletion rates were divided by the substitution length.
        selection = (length_frequencies['group'] == group) & (length_frequencies['type'] == errortype)
        lengths = length_frequencies.loc[selection].groupby('length')['value'].mean()
        mean_length = np.sum(lengths.index.to_numpy() * lengths.to_numpy()) / np.sum(lengths.to_numpy())

        # get the mean error event rate
        mean_error_event_rates[(errortype, group)] = mean_error_rate / mean_length

mean_error_event_rates = pd.Series(mean_error_event_rates).to_frame('rate').reset_index(names=['errortype', 'group'])


def summary(group):
    """Summarise the `rate` column of one error type across groups.

    Returns a Series with 'mean' (the mean of the rates) and 'std' (the
    'std_err' statistic reported by statsmodels' `describe`).
    """
    stats = ds.describe(group.rate)
    d = {
        'mean': stats.loc['mean', 'rate'],
        'std': stats.loc['std_err', 'rate'],
    }
    return pd.Series(d, index=['mean', 'std'])


# one summary row per error type
mean_error_event_rates = mean_error_event_rates.groupby(['errortype'], as_index=False).apply(summary, include_groups=False)

mean_error_event_rates

In [None]:
# Runs of MAX_LENGTH or more consecutive errors are pooled into one "N+" bin.
MAX_LENGTH = 4

# get data for errors by length
idata = data.data["error_frequency_by_length"].copy()

# clip to maximum length, sum up all errors with length >= MAX_LENGTH.
# The pooled rows are separated with a boolean mask instead of being dropped by
# index label: a label-based drop would also remove unrelated rows if the
# aggregated frame carries repeated index labels.
idata['length'] = idata['length'].clip(upper=MAX_LENGTH)
is_pooled = idata['length'] == MAX_LENGTH
newframe = idata.loc[is_pooled].groupby(['type', 'exp', 'read', 'group'])['value'].sum().reset_index()
newframe['length'] = MAX_LENGTH
idata = pd.concat([idata.loc[~is_pooled], newframe], ignore_index=True)


def summary(group):
    """Summarise the `value` column of one (type, length) bin across groups.

    Returns a Series with 'mean' (the mean of the values) and 'std' (the
    'std_err' statistic reported by statsmodels' `describe`).
    """
    stats = ds.describe(group.value)
    d = {
        'mean': stats.loc['mean', 'value'],
        'std': stats.loc['std_err', 'value'],
    }
    return pd.Series(d, index=['mean', 'std'])


df_aggregate = idata.groupby(['type', 'length'], as_index=False).apply(summary, include_groups=False)


# Two stacked panels showing the same bars with different y-ranges (set below),
# which gives a broken-axis view of the distribution.
fig = make_subplots(
    rows=2,
    shared_xaxes=True,
    vertical_spacing=0.05,
    row_heights=[0.3, 0.7],
)

for error_type in ['substitutions', 'insertions', 'deletions']:
    this_data = df_aggregate.loc[df_aggregate['type'] == error_type].copy()
    # identical trace in the upper (row 1) and lower (row 2) panel
    for row in (1, 2):
        fig.add_trace(
            pg.Bar(
                x=this_data['length'],
                y=this_data['mean'],
                marker_color=error_colors[error_type]
            ),
            col=1,
            row=row
        )

fig.update_layout(
    template='simple_white',
    height=200,
    width=250,
    showlegend=False,
    barmode='group',
    margin=dict(l=50, r=0, t=5, b=0),
    font_family="Inter",
    legend_font_size=28/3,
)

# label the last bin as "N+" because it pools all longer runs
ticktext = list(map(str, range(1, MAX_LENGTH+1)))
ticktext[-1] += '+'

# hide the x-axis everywhere, then re-enable it for the bottom panel only
fig.update_xaxes(
    showticklabels=False,
    visible=False,
    range=[0.5, MAX_LENGTH+0.5],
    tickmode='array',
    tickvals=list(range(1, MAX_LENGTH+1)),
    ticktext=ticktext,
)
fig.update_xaxes(title="Length of consecutive errors", showticklabels=True, visible=True, row=2, col=1)

fig.update_yaxes(tickformat=",.0%")
fig.update_yaxes(title="Fraction of errors", range=[0, 0.17], row=2, dtick=0.05, minor_dtick=0.025)
fig.update_yaxes(range=[0.725, 1.0], row=1, dtick=0.2, minor_dtick=0.1)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./SI_figures/error_distribution_by_length.svg")

In [None]:
# Reads with MAX_ERRORS or more errors are pooled into one "N+" bin.
MAX_ERRORS = 6
N_BASES_PER_READ = 60

# get data for errors by read; take an explicit copy of the filtered rows so
# that the column assignment below does not act on a slice of another frame
idata = data.data["error_frequency_by_read"]
idata = idata.loc[idata['type'].isin(['substitutions', 'insertions', 'deletions'])].copy()

# clip to maximum number, sum up all reads with MAX_ERRORS or more errors.
# The pooled rows are separated with a boolean mask instead of being dropped by
# index label: a label-based drop would also remove unrelated rows if the
# aggregated frame carries repeated index labels.
idata['frequency'] = idata['frequency'].clip(upper=MAX_ERRORS)
is_pooled = idata['frequency'] == MAX_ERRORS
newframe = idata.loc[is_pooled].groupby(['type', 'exp', 'read', 'group'])['value'].sum().reset_index()
newframe['frequency'] = MAX_ERRORS
idata = pd.concat([idata.loc[~is_pooled], newframe], ignore_index=True)


def summary(group):
    """Summarise the `value` column of one (type, frequency) bin across groups.

    Returns a Series with 'mean' (the mean of the values) and 'std' (the
    'std_err' statistic reported by statsmodels' `describe`).
    """
    stats = ds.describe(group.value)
    d = {
        'mean': stats.loc['mean', 'value'],
        'std': stats.loc['std_err', 'value'],
    }
    return pd.Series(d, index=['mean', 'std'])


df_aggregate = idata.groupby(['type', 'frequency'], as_index=False).apply(summary, include_groups=False)


fig = pg.Figure()

for error_type in ['substitutions', 'insertions', 'deletions']:

    this_data = df_aggregate.loc[df_aggregate['type'] == error_type].copy()
    fig.add_trace(
        pg.Bar(
            x=this_data['frequency'],
            y=this_data['mean'],
            marker_color=error_colors[error_type]
        )
    )


fig.update_layout(
    template='simple_white',
    height=200,
    width=400,
    showlegend=False,
    barmode='group',
    margin=dict(l=50, r=0, t=5, b=0),
    font_family="Inter",
    legend_font_size=28/3,
)

# label the last bin as "N+" because it pools all larger error counts
ticktext = list(map(str, range(0, MAX_ERRORS+1)))
ticktext[-1] += '+'

fig.update_xaxes(
    title="Number of errors in read",
    range=[-0.5, MAX_ERRORS+0.5],
    tickmode = 'array',
    tickvals = list(range(0, MAX_ERRORS+1)),
    ticktext = ticktext,
)

fig.update_yaxes(
    title="Fraction of reads",
    tickformat=",.0%",
    range=[0, 0.65],
    dtick=0.25,
    title_font_family="Inter",
    title_font_size=28/3,
    tickfont_size=28/3
)

fig = plotting.standardize_plot(fig)
fig.show()
fig.write_image("./SI_figures/error_distribution_by_read.svg")