# Simulation experiment for detection thresholds for variant calling

This notebook is to get detection thresholds for the limit of the variant calling based on: 

1. Read depth
2. Error rate
3. Number of mutations

The data from this comes from: https://pubs.acs.org/doi/10.1021/acscentsci.7b00548 Ape AGW


In [1]:
import pandas as pd
# Visualisation things to make the figures look nice
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sciutil import SciUtil


data_dir = 'ePCR_data/'
fig_dir = 'ePCR_figures/'


plt.rcParams['svg.fonttype'] = 'none'
axis_line_width = 1.0
axis_font_size = 12
title_font_size = 12
label_font_size = 10
figsize = (5, 4) # Figure size
font = 'Arial'
style = 'ticks'
font_family = 'sans-serif'

# Create a figure with 2x2 subplots
sns.set_style("whitegrid")
cmap = 'viridis'
palette = sns.color_palette("viridis", as_cmap=True)
sns.set_palette(cmap)

sns.set(rc={'figure.figsize': figsize, 'font.family': font_family,
            'font.sans-serif': font, 'font.size': label_font_size}, style=style)

def set_ax_params(ax):
    ax.tick_params(direction='out', length=2, width=axis_line_width)
    ax.spines['bottom'].set_linewidth(axis_line_width)
    ax.spines['top'].set_linewidth(0)
    ax.spines['left'].set_linewidth(axis_line_width)
    ax.spines['right'].set_linewidth(0)
    ax.tick_params(labelsize=axis_font_size)
    ax.tick_params(axis='x', which='major', pad=2.0)
    ax.tick_params(axis='y', which='major', pad=2.0)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    ax.tick_params(labelsize=label_font_size)


u = SciUtil()

# Generate mutations on a "real" sequence
# Decided to choose tauD from Ecoli K12
parent_sequence_aa = 'MTPSDIPGYDYGRVEKSPITDLEFDLLKKTVMLGEKDVMYLKKACDVLKDQVDEILDLAGGWVASNEHLIYYFSNPDTGEPIKEYLERVRARFGAWILDTTCRDYNREWLDYQYEVGLRHHRSKKGVTDGVRTVPHIPLRYLIAWIYPITATIKPFLAKKGGSPEDIEGMYNAWFKSVVLQVAIWSHPYTKENDWLEHHHHHH*'

parent_sequence = 'ATGACTCCCTCGGACATCCCGGGATATGATTATGGGCGTGTCGAGAAGTCACCCATCACGGACCTTGAGTTTGACCTTCTGAAGAAGACTGTCATGTTAGGTGAAAAGGACGTAATGTACTTGAAAAAGGCGTGTGACGTTCTGAAAGATCAAGTTGATGAGATCCTTGACTTGGCGGGTGGTTGGGTAGCATCAAATGAGCATTTGATTTATTACTTCTCCAATCCGGATACAGGAGAGCCTATTAAGGAATACCTGGAACGTGTACGCGCTCGCTTTGGAGCCTGGATTCTGGACACTACCTGCCGCGACTATAACCGTGAATGGTTAGACTACCAGTACGAAGTTGGGCTTCGTCATCACCGTTCAAAGAAAGGGGTCACAGACGGAGTACGCACCGTGCCCCATATCCCACTTCGTTATCTTATCGCATGGATCTATCCTATCACCGCCACTATCAAGCCATTTTTGGCTAAGAAAGGTGGCTCTCCGGAAGACATCGAAGGGATGTACAACGCTTGGTTCAAGTCTGTAGTTTTACAAGTTGCCATCTGGTCACACCCTTATACTAAGGAGAATGACTGGCTCGAGCACCACCACCACCACCACTGA'

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Introduce mutations at a given frequency and an error rate

Test setup:

Change to 0.1 - 5% 0.2% step size. 

1. For number of mutations from 1 to the sequence length test mutating each one and correlate this to the p value
2. For sequencing error rates from 0 to 100% make sequences ranging with this and see what the results are for the p value and the error
3. For different sequence lengths also check how sequence length corresponds to the pvalue

In [6]:
from minION import *
from tqdm import tqdm

label = 'ApeAGW'

## Experiment 1: Varying the sequencing error rate for a single mutation

In [10]:
# We're going to make an experiment of 10 plates with different sequencing error rates
read_depth = 25
number_of_wells = 96
epcr_mutation_rate = 0.02
frequency_cutoff = 0.5
library_number = 96 # Usually do a 96 well plate
verbose = False
sequencing_error = 0.1

experiment_df = pd.DataFrame()
for sequencing_error in range(0, 50, 5):
    sequencing_error_rate = sequencing_error/100.0
    run_df = make_experiment(f'SeqError_{sequencing_error}', read_depth, sequencing_error_rate, parent_sequence,
                             library_number, number_of_wells, epcr_mutation_rate, frequency_cutoff)
    run_df.reset_index(inplace=True)
    experiment_df = pd.concat([experiment_df, run_df])

# Also plot each one
experiment_df.to_csv(f'{data_dir}Experiment1.csv', index=False)

100%|███████████████████████████████████████████| 96/96 [00:29<00:00,  3.25it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:32<00:00,  2.96it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:36<00:00,  2.63it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:38<00:00,  2.46it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:41<00:00,  2.30it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:40<00:00,  2.40it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:39<00:00,  2.44it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████

## Experiment 2: varying read depth and it's effect on significance

In [11]:
# We're going to make an experiment of 10 plates with different sequencing error rates
read_depth = 25
number_of_wells = 96
epcr_mutation_rate = 0.02
frequency_cutoff = 0.5
library_number = 96 # Usually do a 96 well plate
verbose = False
sequencing_error = 0.1

experiment_df = pd.DataFrame()
for read_depth in range(5, 100, 5):
    run_df = make_experiment(f'ReadDepth_{read_depth}', read_depth, sequencing_error_rate, parent_sequence, 
                     library_number, number_of_wells, epcr_mutation_rate, frequency_cutoff)
    run_df.reset_index(inplace=True)
    experiment_df = pd.concat([experiment_df, run_df])

 # Also plot each one
experiment_df.to_csv(f'{data_dir}Experiment2_{label}.csv', index=False)

  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:38<00:00,  2.50it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:39<00:00,  2.43it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:46<00:00,  2.05it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:48<00:00,  1.99it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:51<00:00,  1.86it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:53<00:00,  1.81it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:52<00:00,  1.83it/s]
  variant_df.at[current_well, "frequency"] = fre

## Experiment 3: effect of sequence length on significance


In [12]:
# We're going to make an experiment of 10 plates with different sequencing error rates
read_depth = 25
number_of_wells = 96
epcr_mutation_rate = 0.02
frequency_cutoff = 0.5
library_number = 96 # Usually do a 96 well plate
verbose = False
sequencing_error = 0.1


experiment_df = pd.DataFrame()
for seq_len in range(5, 2000, 50):
    if seq_len >= len(parent_sequence):
        break
    run_df = make_experiment(f'SeqLen_{seq_len}', read_depth, sequencing_error_rate, parent_sequence[:seq_len*3],
             library_number, number_of_wells, epcr_mutation_rate, frequency_cutoff)
    run_df.reset_index(inplace=True)
    experiment_df = pd.concat([experiment_df, run_df])

# Also plot each one
experiment_df.to_csv(f'{data_dir}Experiment3_{label}.csv', index=False)


  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 21/21 [00:00<00:00, 55.96it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:12<00:00,  7.71it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:21<00:00,  4.39it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:31<00:00,  3.04it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:41<00:00,  2.33it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:41<00:00,  2.30it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:41<00:00,  2.33it/s]
  variant_df.at[current_well, "frequency"] = fre

## Experiment 4: effect of frequency cutoff

In [13]:
# We're going to make an experiment of 10 plates with different sequencing error rates
read_depth = 25
number_of_wells = 96
epcr_mutation_rate = 0.02
frequency_cutoff = 0.5
library_number = 96 # Usually do a 96 well plate
verbose = False
sequencing_error = 0.1

experiment_df = pd.DataFrame()
for frequency_cutoff in range(5, 100, 10):
    run_df = make_experiment(f'FreqCutoff_{frequency_cutoff}', read_depth, sequencing_error_rate, parent_sequence, library_number, number_of_wells, epcr_mutation_rate, frequency_cutoff/100.0)
    run_df.reset_index(inplace=True)
    experiment_df = pd.concat([experiment_df, run_df])

 # Also plot each one
experiment_df.to_csv(f'{data_dir}Experiment4_{label}.csv', index=False)

  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:44<00:00,  2.16it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:47<00:00,  2.02it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:46<00:00,  2.05it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:49<00:00,  1.96it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:49<00:00,  1.92it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:46<00:00,  2.07it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:53<00:00,  1.78it/s]
  variant_df.at[current_well, "frequency"] = fre

## Experiment 5: ePCR mutation rate

In [14]:
# We're going to make an experiment of 10 plates with different sequencing error rates
read_depth = 25
number_of_wells = 96
epcr_mutation_rate = 0.02
frequency_cutoff = 0.5
library_number = 96 # Usually do a 96 well plate
verbose = False
sequencing_error = 0.1


experiment_df = pd.DataFrame()
for epcr_mutation_rate in range(1, 20, 2):
    run_df = make_experiment(f'ePCR_{epcr_mutation_rate}', read_depth, sequencing_error_rate, parent_sequence, library_number, number_of_wells, 
                             epcr_mutation_rate/1000.0, frequency_cutoff)
    run_df.reset_index(inplace=True)
    experiment_df = pd.concat([experiment_df, run_df])

 # Also plot each one
experiment_df.to_csv(f'{data_dir}Experiment5_{label}.csv', index=False)


  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 42/42 [00:21<00:00,  1.92it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 86/86 [00:43<00:00,  1.98it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 91/91 [00:42<00:00,  2.13it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:46<00:00,  2.05it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:49<00:00,  1.93it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:45<00:00,  2.10it/s]
  variant_df.at[current_well, "frequency"] = frequency
100%|███████████████████████████████████████████| 96/96 [00:43<00:00,  2.23it/s]
  variant_df.at[current_well, "frequency"] = fre