# Simulation experiment for detection thresholds for variant calling

This notebook estimates the detection thresholds (the limits) of the variant calling as a function of:

1. Read depth
2. Error rate
3. Number of mutations


Optionally, also record the time it takes to complete the experiments for each sequence length.


```
>lcl|NC_000913.3_cds_NP_414902.1_362 [gene=tauD] [locus_tag=b0368] [db_xref=UniProtKB/Swiss-Prot:P37610] [protein=taurine dioxygenase] [protein_id=NP_414902.1] [location=387795..388646] [gbkey=CDS]
```

From: ecoli_GCF_000005845.2_ASM584v2_cds_from_genomic.fna
and >NP_414902.1 taurine dioxygenase [Escherichia coli str. K-12 substr. MG1655]



In [5]:
import pandas as pd
# Visualisation things to make the figures look nice
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sciutil import SciUtil


# Output locations: CSVs are written under data_dir and SVG figures under fig_dir by the
# experiment cells below. Nothing in this cell creates these directories.
data_dir = 'ePCR_data/'
fig_dir = 'ePCR_figures/'


# Per the matplotlib docs, 'none' keeps text in saved SVGs as text instead of converting it to paths.
plt.rcParams['svg.fonttype'] = 'none'
# Shared styling constants, read by set_ax_params() and the plotting cells.
axis_line_width = 1.0
axis_font_size = 12
title_font_size = 12
label_font_size = 10
figsize = (5, 4) # Default figure size passed to sns.set() below
font = 'Arial'
style = 'ticks'
font_family = 'sans-serif'

# Global seaborn styling (no figure is created here).
# NOTE(review): the sns.set(...) call at the end of this cell is given style='ticks' and no palette,
# so it presumably overrides both set_style("whitegrid") and set_palette(cmap) - confirm.
sns.set_style("whitegrid")
cmap = 'viridis'
# `palette` is not referenced by the visible cells; the plots pass the string 'viridis' directly.
palette = sns.color_palette("viridis", as_cmap=True)
sns.set_palette(cmap)

sns.set(rc={'figure.figsize': figsize, 'font.family': font_family,
            'font.sans-serif': font, 'font.size': label_font_size}, style=style)

def set_ax_params(ax):
    """Apply the notebook's shared axis styling to a matplotlib Axes, in place.

    Styling applied:
      * outward ticks of length 2 with width ``axis_line_width``;
      * left/bottom spines drawn at ``axis_line_width``, top/right spines at width 0;
      * tick label padding of 2.0 on both axes;
      * x tick labels rotated 45 degrees and right-aligned;
      * tick labels sized with ``label_font_size``.

    Reads the notebook-level constants ``axis_line_width`` and ``label_font_size``.

    Parameters
    ----------
    ax : matplotlib Axes
        The axes to style (e.g. the object returned by ``sns.scatterplot``).

    Returns
    -------
    None
    """
    ax.tick_params(direction='out', length=2, width=axis_line_width)
    for visible_side in ('bottom', 'left'):
        ax.spines[visible_side].set_linewidth(axis_line_width)
    # Zero line width hides the top/right spines without removing them.
    for hidden_side in ('top', 'right'):
        ax.spines[hidden_side].set_linewidth(0)
    ax.tick_params(axis='both', which='major', pad=2.0)
    # Rotate via tick_params and align the existing label objects, instead of
    # ax.set_xticklabels(ax.get_xticklabels(), ...): the latter replaces the formatter with a fixed
    # list of the current label strings (and warns on recent matplotlib when the locator is not fixed).
    ax.tick_params(axis='x', labelrotation=45)
    for tick_label in ax.get_xticklabels():
        tick_label.set_horizontalalignment('right')
    # The original also set labelsize=axis_font_size first; that call was always overridden by this one.
    ax.tick_params(labelsize=label_font_size)


# SciUtil helper instance; make_experiment() uses u.dp(...) for its verbose output.
u = SciUtil()

# Generate mutations on a "real" sequence
# Decided to choose tauD (taurine dioxygenase) from E. coli K-12 MG1655, see the FASTA headers above.
# parent_sequence_aa: the protein sequence. It is not referenced by the visible experiment cells.
parent_sequence_aa = 'MSERLSITPLGPYIGAQISGADLTRPLSDNQFEQLYHAVLRHQVVFLRDQAITPQQQRALAQRFGELHIHPVYPHAEGVD\
EIIVLDTHNDNPPDNDNWHTDVTFIETPPAGAILAAKELPSTGGDTLWTSGIAAYEALSVPFRQLLSGLRAEHDFRKSFP\
EYKYRKTEEEHQRWREAVAKNPPLLHPVVRTHPVSGKQALFVNEGFTTRIVDVSEKESEALLSFLFAHITKPEFQVRWRW\
QPNDIAIWDNRVTQHYANADYLPQRRIMHRATILGDKPFYRAG'

# parent_sequence: the coding DNA sequence (starts with ATG, ends with the TAA stop codon).
# This is the reference that every experiment below mutates and calls variants against.
parent_sequence = 'ATGAGTGAACGTCTGAGCATTACCCCGCTGGGGCCGTATATCGGCGCACAAATTTCGGGTGCCGACCTGACGCGCCCGTT\
AAGCGATAATCAGTTTGAACAGCTTTACCATGCGGTGCTGCGCCATCAGGTGGTGTTTCTACGCGATCAAGCTATTACGC\
CGCAGCAGCAACGCGCGCTGGCCCAGCGTTTTGGCGAATTGCATATTCACCCTGTTTACCCGCATGCCGAAGGGGTTGAC\
GAGATCATCGTGCTGGATACCCATAACGATAATCCGCCAGATAACGACAACTGGCATACCGATGTGACATTTATTGAAAC\
GCCACCCGCAGGGGCGATTCTGGCAGCTAAAGAGTTACCTTCGACCGGCGGTGATACGCTCTGGACCAGCGGTATTGCGG\
CCTATGAGGCGCTCTCTGTTCCCTTCCGCCAGCTGCTGAGTGGGCTGCGTGCGGAGCATGATTTCCGTAAATCGTTCCCG\
GAATACAAATACCGCAAAACCGAGGAGGAACATCAACGCTGGCGCGAGGCGGTCGCGAAAAACCCGCCGTTGCTACATCC\
GGTGGTGCGAACGCATCCGGTGAGCGGTAAACAGGCGCTGTTTGTGAATGAAGGCTTTACTACGCGAATTGTTGATGTGA\
GCGAGAAAGAGAGCGAAGCCTTGTTAAGTTTTTTGTTTGCCCATATCACCAAACCGGAGTTTCAGGTGCGCTGGCGCTGG\
CAACCAAATGATATTGCGATTTGGGATAACCGCGTGACCCAGCACTATGCCAATGCCGATTACCTGCCACAGCGACGGAT\
AATGCATCGGGCGACGATCCTTGGGGATAAACCGTTTTATCGGGCGGGGTAA'

# Introduce mutations at a given frequency and an error rate

Test setup:

1. For the number of mutations, from 1 up to the sequence length, mutate that many positions and relate the number of mutations to the p-value
2. For sequencing error rates from 0 to 100%, generate reads at each error rate and see how the p-value and the calling error respond
3. For different sequence lengths, check how the sequence length relates to the p-value

In [6]:
from minION import *
from tqdm import tqdm

def make_experiment(run_label, read_depth, sequencing_error_rate, parent_sequence, positions, library_number,
                    number_of_wells, epcr_mutation_rate, frequency_cutoff=0.5, verbose=None):
    """Simulate one plate and call a variant for every simulated well.

    The simulated library comes from ``make_epcr_de_experiment``; each entry is piled up against
    ``parent_sequence``, tested for significance and labelled with ``get_variant_label_for_well``.
    Results are written row by row (row label = well counter, starting at 0) into the frame returned
    by ``get_dummy_plate_df``, and the per-well p-values are then corrected with Benjamini-Hochberg
    FDR (``multipletests(..., method='fdr_bh')``).

    Parameters
    ----------
    run_label : str
        Label for this plate, passed to ``get_dummy_plate_df``.
    read_depth : int
        Passed to ``make_epcr_de_experiment`` and recorded as ``Alignment_count`` for every well.
    sequencing_error_rate : float
        Passed to ``make_epcr_de_experiment``.
    parent_sequence : str
        Reference sequence: the simulation input and the reference for the pileup.
    positions : list
        Not used by this function; kept so existing callers keep working.
    library_number : int
        Passed to ``make_epcr_de_experiment``.
    number_of_wells : int
        Only passed to ``get_dummy_plate_df``. The number of rows actually written is one per
        entry returned by ``make_epcr_de_experiment``.
    epcr_mutation_rate : float
        Passed to ``make_epcr_de_experiment``.
    frequency_cutoff : float, default 0.5
        Passed to ``get_variant_label_for_well``.
    verbose : bool or None, default None
        Print the call for every well via ``u.dp``. ``None`` falls back to a notebook-level
        ``verbose`` variable if one exists (the previous behaviour), otherwise ``False``.

    Returns
    -------
    pandas.DataFrame
        The plate frame with "Mixed Well", "Variant", "frequency", "P value", "Well",
        "Alignment_count" and "P adj." filled in.
    """
    if verbose is None:
        # Earlier versions read a global `verbose` that was only defined in a later cell;
        # keep honouring it when present, but no longer fail when it is missing.
        verbose = globals().get('verbose', False)

    # Make a full experiment setup
    mutated_sequence = make_epcr_de_experiment(read_depth, sequencing_error_rate, parent_sequence, library_number,
                                               epcr_mutation_rate)

    variant_df = get_dummy_plate_df(run_label, 'Well', number_of_wells)  # i.e. 20 - 1 since we don't have the other variant

    parent_name = 'Parent'
    pileup_columns = ['gene_name', 'position', 'ref', 'most_frequent', 'freq_non_ref', 'total_other',
                      'total_reads', 'p_value', 'percent_most_freq_mutation', 'A', 'p(a)', 'T', 'p(t)', 'G',
                      'p(g)',
                      'C', 'p(c)', 'N', 'p(n)']

    for current_well, mutant in enumerate(tqdm(mutated_sequence)):
        reads = list(mutated_sequence[mutant])
        read_ids = [f'read_{i}' for i in range(len(reads))]
        quals = [100] * len(reads)  # Dummy qualities, one per read

        well_df = make_well_df_from_reads(reads, read_ids, quals)
        rows_all = make_row_from_read_pileup_across_well(well_df, parent_sequence, parent_name)
        well_df = pd.DataFrame(rows_all)
        well_df.columns = pileup_columns
        well_df = calculate_mutation_significance_across_well(well_df)
        label, frequency, combined_p_value, mixed_well = get_variant_label_for_well(well_df, frequency_cutoff)
        # This should be mutated at 100% - the rate of our sequencing error
        if verbose:
            u.dp(["label", label, f"frequency", frequency, f"combined_p_value", combined_p_value, "mixed_well", mixed_well])
        variant_df.at[current_well, "Mixed Well"] = mixed_well
        variant_df.at[current_well, "Variant"] = label
        variant_df.at[current_well, "frequency"] = frequency
        variant_df.at[current_well, "P value"] = combined_p_value
        variant_df.at[current_well, "Well"] = f'Well {current_well}'
        variant_df.at[current_well, "Alignment_count"] = read_depth

    # Before returning adjust the pvalues
    variant_df['P adj.'] = multipletests(list(variant_df["P value"].values), alpha=0.05, method='fdr_bh')[1]
    return variant_df

## Experiment 1: Varying the sequencing error rate for a single mutation

In [None]:
# Experiment 1: one simulated plate per sequencing error rate, 0% to 95% in 5% steps (20 plates),
# repeated for three single positions.
# NOTE(review): make_experiment() accepts `positions` but does not use it in the visible code, so the
# three repeats differ only in their output file names and in the simulated data - confirm intent.
read_depth = 100
number_of_wells = 19
epcr_mutation_rate = 0.02
frequency_cutoff=0.5
library_number = 96 # Usually do a 96 well plate
verbose = False
for position in [1, 20, 100]:
    positions = [position]
    # Collect the plates and concatenate once: avoids re-copying the growing frame on every
    # iteration and avoids concatenating onto an empty DataFrame.
    run_dfs = []
    for sequencing_error in range(0, 100, 5):
        sequencing_error_rate = sequencing_error/100.0
        run_df = make_experiment(f'SeqError_{sequencing_error}', read_depth, sequencing_error_rate, parent_sequence, positions,
                                 library_number, number_of_wells, epcr_mutation_rate, frequency_cutoff)
        run_dfs.append(run_df.reset_index())
    experiment_df = pd.concat(run_dfs)

    # Save the raw results first; the plotting columns below are not part of the CSV.
    experiment_df.to_csv(f'{data_dir}Experiment1_position:{position}.csv', index=False)

    # Also plot each one
    experiment_df['-log10(P.adj)'] = -1*np.log10(experiment_df['P adj.'].values)
    # The plate label is 'SeqError_<percent>'; recover the rate as a fraction.
    experiment_df['Error rate'] = [int(c.split('_')[-1])/100.0 for c in experiment_df['Plate']]
    # Variant labels are split on '_' to count mutations; 5 or more are binned together.
    n_mutations = [len(v.split('_')) for v in experiment_df['Variant'].values]
    experiment_df['# Mutations'] = [f'{n}' if n < 5 else '>=5' for n in n_mutations]
    # Row labels repeat across plates after the concat; give every row a unique label for plotting.
    experiment_df = experiment_df.reset_index()
    ax = sns.scatterplot(experiment_df, x='frequency', y='-log10(P.adj)', hue='Error rate', palette='viridis',
                         style="# Mutations", style_order=['1', '2', '3', '4', '>=5'], s=80)
    set_ax_params(ax)
    plt.xlabel('Mutation frequency')
    plt.ylabel('-log10(padj)')
    plt.title('Effect of sequencing error on significance', fontsize=title_font_size, fontweight="bold")
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.savefig(f'{fig_dir}Experiment1_position:{position}.svg')
    plt.show()

100%|█| 96/96 [00:40<00:00,  2.38i
  variant_df.at[current_well, "frequency"] = frequency
100%|█| 96/96 [00:53<00:00,  1.80i
  variant_df.at[current_well, "frequency"] = frequency
100%|█| 96/96 [00:58<00:00,  1.65i
  variant_df.at[current_well, "frequency"] = frequency
 41%|▍| 39/96 [00:23<00:36,  1.58i

## Experiment 2: Varying the sequencing error rate for multiple mutations

In [None]:
# Experiment 2: one simulated plate per sequencing error rate, 0% to 90% in 10% steps (10 plates),
# for a set of three positions. Only the CSV is written; plotting is currently disabled.
# Uses read_depth, library_number, epcr_mutation_rate and frequency_cutoff from the Experiment 1 cell.
verbose = False
for positions in [[50, 14, 90]]:
    number_of_wells = (19*3)  # 19 aas * 3 positions
    # Collect the plates and concatenate once instead of growing the frame inside the loop.
    run_dfs = []
    for sequencing_error in range(0, 100, 10):
        sequencing_error_rate = sequencing_error/100.0
        run_df = make_experiment(f'SeqError_{sequencing_error}', read_depth, sequencing_error_rate, parent_sequence, positions,
                                 library_number, number_of_wells, epcr_mutation_rate, frequency_cutoff)
        run_dfs.append(run_df.reset_index())
    experiment_df = pd.concat(run_dfs)

    # Build the file-name label separately so the loop variable keeps its integer positions.
    position_label = "-".join(str(p) for p in positions)
    experiment_df.to_csv(f'{data_dir}Experiment2_position:{position_label}.csv', index=False)

    # Plotting (disabled):
    # experiment_df['-log10(P.adj)'] = -1*np.log10(experiment_df['P adj.'].values)
    # experiment_df['Error rate'] = [int(c.split('_')[-1])/100.0 for c in experiment_df['Plate']]
    # experiment_df['# Mutations'] = [len(v.split('_')) for v in experiment_df['Variant'].values]
    # experiment_df['# Mutations'] = [f'{v}' if v < 5 else f'>=5' for v in experiment_df['# Mutations']]
    # experiment_df = experiment_df.reset_index()
    # ax = sns.scatterplot(experiment_df, x='frequency', y='-log10(P.adj)', hue='Error rate', palette='viridis',
    #                      style="# Mutations", style_order=['1', '2', '3', '4', '>=5'], s=80)
    # set_ax_params(ax)
    # plt.xlabel('Mutation frequency')
    # plt.ylabel('-log10(padj)')
    # plt.title('Effect of sequencing error on significance', fontsize=title_font_size, fontweight="bold")
    # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    # plt.savefig(f'{fig_dir}Experiment2_positions:{position_label}.svg')
    # plt.show()

## Experiment 3: varying read depth and its effect on significance (1 position)

In [None]:
# Experiment 3: one simulated plate per read depth, 5 to 95 in steps of 5 (19 plates), at a fixed
# sequencing error rate, repeated for three single positions.
# Uses library_number, epcr_mutation_rate and frequency_cutoff from the Experiment 1 cell.
sequencing_error_rate = 0.1
number_of_wells = 19

for position in [1, 20, 100]:
    positions = [position]
    # Collect the plates and concatenate once instead of growing the frame inside the loop.
    run_dfs = []
    for read_depth in range(5, 100, 5):
        # Report the value being varied here (previously this printed the stale `sequencing_error`
        # left over from an earlier cell).
        print(read_depth)
        run_df = make_experiment(f'ReadDepth_{read_depth}', read_depth, sequencing_error_rate, parent_sequence, positions,
                                 library_number, number_of_wells, epcr_mutation_rate, frequency_cutoff)
        run_dfs.append(run_df.reset_index())
    experiment_df = pd.concat(run_dfs)

    # Save the raw results first; the plotting columns below are not part of the CSV.
    experiment_df.to_csv(f'{data_dir}Experiment3_position:{position}.csv', index=False)

    # Also plot each one
    experiment_df['-log10(P.adj)'] = -1*np.log10(experiment_df['P adj.'].values)
    # The plate label is 'ReadDepth_<depth>'.
    experiment_df['Read depth'] = [int(c.split('_')[-1]) for c in experiment_df['Plate']]
    # Variant labels are split on '_' to count mutations; 5 or more are binned together.
    n_mutations = [len(v.split('_')) for v in experiment_df['Variant'].values]
    experiment_df['# Mutations'] = [f'{n}' if n < 5 else '>=5' for n in n_mutations]
    # Row labels repeat across plates after the concat; give every row a unique label for plotting.
    experiment_df = experiment_df.reset_index()
    ax = sns.scatterplot(experiment_df, x='frequency', y='-log10(P.adj)', hue='Read depth', palette='viridis',
                         style="# Mutations", style_order=['1', '2', '3', '4', '>=5'], s=80)
    set_ax_params(ax)
    plt.xlabel('Mutation frequency')
    plt.ylabel('-log10(padj)')
    plt.title('Effect of read depth on significance', fontsize=title_font_size, fontweight="bold")
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.savefig(f'{fig_dir}Experiment3_position:{position}.svg')
    plt.show()

## Experiment 4: effect of sequence length on significance


In [None]:
# Experiment 4: one simulated plate per sequence length, 5 to 185 codons in steps of 20 (10 plates),
# at a fixed sequencing error rate and read depth, repeated for three single positions.
# Uses library_number, epcr_mutation_rate and frequency_cutoff from the Experiment 1 cell.
sequencing_error_rate = 0.1
number_of_wells = 19
read_depth = 25

for position in [1, 2, 3]:
    positions = [position]
    # Collect the plates and concatenate once instead of growing the frame inside the loop.
    run_dfs = []
    for seq_len in range(5, 200, 20):
        # Report the value being varied here (previously this printed the stale `sequencing_error`
        # left over from an earlier cell).
        print(seq_len)
        # seq_len counts codons, so the parent is truncated to seq_len*3 nucleotides.
        run_df = make_experiment(f'SeqLen_{seq_len}', read_depth, sequencing_error_rate, parent_sequence[:seq_len*3], positions,
                                 library_number, number_of_wells, epcr_mutation_rate, frequency_cutoff)
        run_dfs.append(run_df.reset_index())
    experiment_df = pd.concat(run_dfs)

    # Save the raw results first; the plotting columns below are not part of the CSV.
    experiment_df.to_csv(f'{data_dir}Experiment4_position:{position}.csv', index=False)

    # Also plot each one
    experiment_df['-log10(P.adj)'] = -1*np.log10(experiment_df['P adj.'].values)
    # The plate label is 'SeqLen_<codons>'.
    experiment_df['Seq length'] = [int(c.split('_')[-1]) for c in experiment_df['Plate']]
    # Variant labels are split on '_' to count mutations; 5 or more are binned together.
    n_mutations = [len(v.split('_')) for v in experiment_df['Variant'].values]
    experiment_df['# Mutations'] = [f'{n}' if n < 5 else '>=5' for n in n_mutations]
    # Row labels repeat across plates after the concat; give every row a unique label for plotting.
    experiment_df = experiment_df.reset_index()
    ax = sns.scatterplot(experiment_df, x='frequency', y='-log10(P.adj)', hue='Seq length', palette='viridis',
                         style="# Mutations", style_order=['1', '2', '3', '4', '>=5'], s=80)
    set_ax_params(ax)
    plt.xlabel('Mutation frequency')
    plt.ylabel('-log10(padj)')
    plt.title('Effect of sequence length on significance', fontsize=title_font_size, fontweight="bold")
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.savefig(f'{fig_dir}Experiment4_position:{position}.svg')
    plt.show()

In [None]:
# Experiment 5: one simulated plate per frequency cutoff, 10% to 80% in 10% steps (8 plates), at a
# fixed sequencing error rate and read depth, repeated for three single positions.
# Only the CSV is written; plotting is currently disabled.
# Uses library_number and epcr_mutation_rate from the Experiment 1 cell.
sequencing_error_rate = 0.1
number_of_wells = 19
read_depth = 25

for position in [1, 20, 100]:
    positions = [position]
    # Collect the plates and concatenate once instead of growing the frame inside the loop.
    run_dfs = []
    # A dedicated loop variable keeps the notebook-level `frequency_cutoff` (a fraction) from being
    # overwritten with integer percentages.
    for cutoff_percent in range(10, 90, 10):
        run_df = make_experiment(f'FreqCutoff_{cutoff_percent}', read_depth, sequencing_error_rate, parent_sequence, positions,
                                 library_number, number_of_wells, epcr_mutation_rate, cutoff_percent/100.0)
        run_dfs.append(run_df.reset_index())
    experiment_df = pd.concat(run_dfs)

    experiment_df.to_csv(f'{data_dir}Experiment5_position:{position}.csv', index=False)

    # Plotting (disabled); labels name the variable varied in this experiment:
    # experiment_df['-log10(P.adj)'] = -1*np.log10(experiment_df['P adj.'].values)
    # experiment_df['Frequency cutoff'] = [int(c.split('_')[-1])/100.0 for c in experiment_df['Plate']]
    # experiment_df['# Mutations'] = [len(v.split('_')) for v in experiment_df['Variant'].values]
    # experiment_df['# Mutations'] = [f'{v}' if v < 5 else f'>=5' for v in experiment_df['# Mutations']]
    # experiment_df = experiment_df.reset_index()
    # ax = sns.scatterplot(experiment_df, x='frequency', y='-log10(P.adj)', hue='Frequency cutoff', palette='viridis',
    #                      style="# Mutations", style_order=['1', '2', '3', '4', '>=5'], s=80)
    # set_ax_params(ax)
    # plt.xlabel('Mutation frequency')
    # plt.ylabel('-log10(padj)')
    # plt.title('Effect of frequency cutoff on significance', fontsize=title_font_size, fontweight="bold")
    # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    # plt.savefig(f'{fig_dir}Experiment5_position:{position}.svg')
    # plt.show()

In [None]:
# Experiment 6: one simulated plate per epPCR mutation rate, 1% to 9% in 1% steps (9 plates), at a
# fixed sequencing error rate, read depth and frequency cutoff, repeated for three single positions.
# Only the CSV is written; plotting is currently disabled.
# Uses library_number from the Experiment 1 cell.
sequencing_error_rate = 0.1
number_of_wells = 19
read_depth = 25
frequency_cutoff = 0.5 

for position in [1, 20, 100]:
    positions = [position]
    # Collect the plates and concatenate once instead of growing the frame inside the loop.
    run_dfs = []
    # A dedicated loop variable keeps the notebook-level `epcr_mutation_rate` (a fraction) from being
    # overwritten with integer percentages.
    for epcr_rate_percent in range(1, 10, 1):
        run_df = make_experiment(f'ePCR_{epcr_rate_percent}', read_depth, sequencing_error_rate, parent_sequence, positions,
                                 library_number, number_of_wells, epcr_rate_percent/100.0, frequency_cutoff)
        run_dfs.append(run_df.reset_index())
    experiment_df = pd.concat(run_dfs)

    experiment_df.to_csv(f'{data_dir}Experiment6_position:{position}.csv', index=False)

    # Plotting (disabled); labels and output file name this experiment, not Experiment 5:
    # experiment_df['-log10(P.adj)'] = -1*np.log10(experiment_df['P adj.'].values)
    # experiment_df['ePCR mutation rate'] = [int(c.split('_')[-1])/100.0 for c in experiment_df['Plate']]
    # experiment_df['# Mutations'] = [len(v.split('_')) for v in experiment_df['Variant'].values]
    # experiment_df['# Mutations'] = [f'{v}' if v < 5 else f'>=5' for v in experiment_df['# Mutations']]
    # experiment_df = experiment_df.reset_index()
    # ax = sns.scatterplot(experiment_df, x='frequency', y='-log10(P.adj)', hue='ePCR mutation rate', palette='viridis',
    #                      style="# Mutations", style_order=['1', '2', '3', '4', '>=5'], s=80)
    # set_ax_params(ax)
    # plt.xlabel('Mutation frequency')
    # plt.ylabel('-log10(padj)')
    # plt.title('Effect of epPCR mutation rate on significance', fontsize=title_font_size, fontweight="bold")
    # plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    # plt.savefig(f'{fig_dir}Experiment6_position:{position}.svg')
    # plt.show()