In [1]:
from abc import ABC, abstractmethod
from Bio.Data import CodonTable

import numpy as np
import pandas as pd
import random
import warnings

This notebook is for putting together the end-to-end tests of evSeq. We will reconstruct inputs for known outputs, then test them to make sure that the evSeq software outputs what we expect.

Steps:
1. Build a random amino acid sequence. This is our reference AA.
2. Decide if we are operating in detailed_refseq mode or not. If not, then the random amino acid sequence gives what we expect for the full plate.
3. Build variants of the input amino acid sequences. Decide if we have a mixed well or not. A mixed well means we need a second amino acid sequence.
4. Build both a "NNN" and no variation version of the reference sequence corresponding to positions where we expect mutations.
5. Choose a random codon to encode the amino acid changes in the sequence.
6. For building reads, choose:
    1. Ratio of different sequences. This decides whether or not they are recognized by `variable_thresh`
    2. Number of total sequences *at each position*. This decides whether a well is considered dead or not by `variable_count`
7. Build a set of perfect reads up to the total number of target sequences. 
8. Add on bad sequences that should get filtered out by the `bp_q_cutoff`, `length_cutoff`, and `average_q_cutoff` parameters. Also add indels.
9. Pad on additional sequence information that creates frameshifts. Add to an input file for processing.

Some methods are shared. These are held in an abstract base class:

In [2]:
class FakeData(ABC):
    """
    Common interface for the fake-data generators used in the evSeq tests.

    Both builders below are abstract, so a subclass can only be instantiated
    once it overrides each of them.
    """
    @abstractmethod
    def build_output_counts(self):
        """
        Build the output files for the different `OutputCounts`.
        """
        return None

    @abstractmethod
    def build_filtered_fastqs(self):
        """
        Build the ParsedFilteredFastqs files for the different wells.

        Implementations are expected to combine the `perfect_reads` and
        `corrupted_bases` reads into what evSeq should emit as its
        `ParsedFilteredFastqs` output.
        """
        return None

The BioPython codon table implementation is beyond bad, so we rewrite it here, fixing all of the BioPython flaws:

In [3]:
class CustomCodonTable():
    """
    Stand-alone codon table derived from BioPython's standard DNA table.

    Deliberately not a subclass of the BioPython table. Two gaps in that table
    are filled in here:

    * `codon_to_aa` also maps every stop codon, using "*" as the translation.
    * `aa_to_codon` lists *all* codons for each amino acid (and for "*"),
      rather than a single representative codon.
    """
    def __init__(self):

        # evSeq ships its own codon table, so sourcing this one from BioPython
        # gives an independent table to validate against.
        standard_table = CodonTable.standard_dna_table

        # Forward lookup: sense codons first, then the stop codons as "*".
        self.codon_to_aa = standard_table.forward_table.copy()
        self.codon_to_aa.update({stop: "*" for stop in standard_table.stop_codons})

        # Reverse lookup: collect every codon that encodes each amino acid,
        # in the order the codons appear in the forward lookup.
        self.aa_to_codon = {}
        for triplet, residue in self.codon_to_aa.items():
            self.aa_to_codon.setdefault(residue, []).append(triplet)

We need global variables that define the bounds on our testing. Everything is randomly produced, so we need to set ranges on our sampling:

In [4]:
# Import evSeq globals needed for testing
from evSeq.util.globals import (BARCODE_LENGTH, 
                                ADAPTER_F,
                                ADAPTER_LENGTH_F,
                                ADAPTER_R,
                                ADAPTER_LENGTH_R)

# NOTE: Wherever a MIN_*/MAX_* pair below is passed to `NP_RNG.integers`, the
# MAX value is the *exclusive* upper bound of the draw (e.g. MAX_REFSEQ_LEN = 251
# allows reference sequences of at most 250 amino acids).

# Reference sequence bounds (IN NUMBER OF AAS!!)
MIN_REFSEQ_LEN = 50
MAX_REFSEQ_LEN = 251

# Bounds on readlength (in bases). The upper bound actually used by `Config` is
# the smaller of MAX_READLENGTH and the shortest readable window of any refseq.
MIN_READLENGTH = 150
MAX_READLENGTH = 301
# Sanity check: the shortest possible reference (in bases) may not be longer than
# the shortest read. NOTE(review): the intent of this check is not stated -- confirm.
assert (MIN_REFSEQ_LEN * 3) <= MIN_READLENGTH

# Bounds on quality cutoffs (`bp_q_cutoff` and `average_q_cutoff`, respectively)
MIN_BP_QUAL_CUTOFF = 15
MAX_BP_QUAL_CUTOFF = 36

MIN_GLOBAL_QUAL_CUTOFF = 15
MAX_GLOBAL_QUAL_CUTOFF = 36

# Exclusive upper bound on generated base qualities (see `QualityGenerator`)
MAX_QUAL_ALLOWED = 41

# Bounds on sequence length cutoffs (sampled uniformly for `length_cutoff`)
MIN_SEQLEN_CUTOFF = 0.0
MAX_SEQLEN_CUTOFF = 1.0

# Bounds on number of indels added
MIN_INDELS_ADDED = 1
MAX_INDELS_ADDED = 4

# Bounds on primer lengths (length of the random primer seed on each end)
PRIMER_MIN_LEN = 10
PRIMER_MAX_LEN = 41

# Bounds on the frameshift for a reference seq (number of extra bases placed
# before/after the codon region)
FRAMESHIFT_MIN = 0
FRAMESHIFT_MAX = 4

# Bounds on position identification (`variable_thresh` is sampled uniformly,
# `variable_count` is sampled as an integer)
MIN_VARIABLE_THRESH = 0.0
MAX_VARIABLE_THRESH = 1.0
MIN_VARIABLE_COUNT = 0
MAX_VARIABLE_COUNT = 21

# Number of variants in a well. Unlike the other integer bounds, `FakeWell`
# treats MAX_N_VARIANTS as inclusive (it samples up to `max_n_variants + 1`).
MIN_N_VARIANTS = 1
MAX_N_VARIANTS = 4

# Number of reads in a well
MIN_N_READS = 0
MAX_N_READS = 101

# Bounds on number of mutations per sequence (as a fraction
# of the number of amino acids captured by the read-length)
MIN_PERC_MUTATED = 0.0
MAX_PERC_MUTATED = 0.2

# Number of amino acids to rescue in overlapping regions (as
# a fraction of the total number of rescue options)
MIN_RESCUE_PERC = 0.0
MAX_RESCUE_PERC = 0.5

# Allowed nucleotides
ALLOWED_NUCLEOTIDES = ("A", "T", "C", "G")

# Codon table and allowed amino acid characters. ALLOWED_AAS is every key of the
# reverse codon table in sorted order, so it includes the stop character "*".
CODON_TABLE = CustomCodonTable()
ALLOWED_AAS = tuple(sorted(list(CODON_TABLE.aa_to_codon.keys())))
INT_TO_AA = dict(enumerate(ALLOWED_AAS))
N_AAS = len(ALLOWED_AAS)

# Random number generators. Both the numpy and the stdlib generator are seeded
# with the same value (the sum of the character codes of "PDawg", i.e. 467).
# Every random draw in this notebook goes through one of these two objects, so
# results depend on the order in which cells are executed.
RANDOM_SEED = sum(ord(char) for char in "PDawg")
NP_RNG = np.random.default_rng(RANDOM_SEED)
RANDOM_RNG = random.Random(RANDOM_SEED)

# Barcode file. The path is relative to the working directory of the notebook.
# N_INDICES is the number of rows in the index map and N_PLATES is the number of
# distinct values in its `IndexPlate` column; `Config` builds one reference
# sequence per index (detailed mode) or per plate (non-detailed mode).
INDEX_DF = pd.read_csv("../evSeq/util/index_map.csv")
N_INDICES = len(INDEX_DF)
N_PLATES = len(INDEX_DF.IndexPlate.unique())

We will store basic variables used for building test data in a configuration class (`Config`), defined below along with the `QualityGenerator` and `FakeRefseq` helpers that it relies on:

In [5]:
class QualityGenerator():
    """
    Draws arrays of random base quality scores.

    Scores are sampled from `NP_RNG` as integers from `min_q_allowed`
    (inclusive) up to `MAX_QUAL_ALLOWED` (exclusive).
    """
    def __init__(self, min_q_allowed):
        # Lowest quality score this generator is allowed to emit
        self.min_q_allowed = min_q_allowed

    def generate_qualities(self, size):
        """
        Return `size` random quality scores as an integer array.
        """
        lowest, ceiling = self.min_q_allowed, MAX_QUAL_ALLOWED
        return NP_RNG.integers(lowest, ceiling, size = size)

class FakeRefseq():
    """
    A randomly generated reference sequence plus the sequence that flanks it.

    On construction a random amino acid sequence is drawn, one codon is picked
    for each amino acid, and random frameshift bases and primer seeds are
    generated for both ends. All draws come from the module-level `NP_RNG` and
    `RANDOM_RNG` generators, so the result depends on their current state.

    Attributes set by `__init__`:
        refseq_len, aa_refseq: Length (in amino acids) and residues of the reference.
        codon_refseq, codon_refseq_len: One codon per residue; length in bases.
        frameshift_front/back, frameshift_bp_front/back: Number of extra bases
            placed before/after the codons, and the bases themselves.
        primer_seed_len_f/r, primer_seed_f/r: Lengths and bases of the primer seeds.
        readable_window_len: Total number of bases spanned by codons, frameshifts,
            primer seeds, both adapters, and both barcodes.

    `define_refseq_windows` and `assign_qualities` add further attributes and
    must be called before the window/quality attributes are read.
    """
    def __init__(self):

        # Randomly create an amino acid reference sequence. Record the length.
        self.refseq_len = NP_RNG.integers(MIN_REFSEQ_LEN, MAX_REFSEQ_LEN)
        aa_ints = NP_RNG.choice(N_AAS, size = self.refseq_len)
        self.aa_refseq = [INT_TO_AA[aa_int] for aa_int in aa_ints]

        # Assign a codon to each amino acid.
        self.codon_refseq = self.aa_seq_to_codon_seq(self.aa_refseq)
        self.codon_refseq_len = self.refseq_len * 3

        # Decide on the frameshift of the reference sequence
        self.frameshift_front = NP_RNG.integers(FRAMESHIFT_MIN, FRAMESHIFT_MAX)
        self.frameshift_back = NP_RNG.integers(FRAMESHIFT_MIN, FRAMESHIFT_MAX)

        # Determine the bases for the frameshift
        self.frameshift_bp_front = RANDOM_RNG.choices(ALLOWED_NUCLEOTIDES,
                                                      k = self.frameshift_front)
        self.frameshift_bp_back = RANDOM_RNG.choices(ALLOWED_NUCLEOTIDES,
                                                     k = self.frameshift_back)

        # Create primer seeds
        self.primer_seed_len_f = NP_RNG.integers(PRIMER_MIN_LEN, PRIMER_MAX_LEN)
        self.primer_seed_len_r = NP_RNG.integers(PRIMER_MIN_LEN, PRIMER_MAX_LEN)
        self.primer_seed_f = RANDOM_RNG.choices(ALLOWED_NUCLEOTIDES, k = self.primer_seed_len_f)
        self.primer_seed_r = RANDOM_RNG.choices(ALLOWED_NUCLEOTIDES, k = self.primer_seed_len_r)

        # Assign the total readable window
        self.readable_window_len = (self.codon_refseq_len +
                                    self.frameshift_front +
                                    self.frameshift_back +
                                    self.primer_seed_len_f +
                                    self.primer_seed_len_r +
                                    ADAPTER_LENGTH_F +
                                    ADAPTER_LENGTH_R +
                                    2 * BARCODE_LENGTH)

    def define_refseq_windows(self, readlength):
        """
        Defines which amino acid positions of the refseq can be mutated.

        A position is mutable if its codon is captured in full by the forward
        read, the reverse read, or both.

        Parameters
        ----------
        readlength: int
            Number of bases in each (forward and reverse) read.

        Sets
        ----
        forward_readable_aas: list of int
            Amino acid indices fully covered by the forward read.
        reverse_readable_aas: list of int
            Amino acid indices fully covered by the reverse read.
        mutable_aa_inds: np.ndarray of int
            Sorted, unique union of the two lists above.
        double_count_inds: set of int
            Indices covered by both reads.
        """
        # Each read spends bases on the barcode, the adapter, the primer seed,
        # and the frameshift bases before it reaches the first codon. Whatever
        # is left over, in whole codons, is the number of amino acids it reads.
        effective_readlength = readlength - BARCODE_LENGTH
        n_readable_aas_f = (effective_readlength - self.frameshift_front -
                            ADAPTER_LENGTH_F - self.primer_seed_len_f) // 3
        n_readable_aas_r = (effective_readlength - self.frameshift_back -
                            ADAPTER_LENGTH_R - self.primer_seed_len_r) // 3

        # A read can be long enough to run off the far end of the codon region
        # (and into the opposite frameshift/primer/adapter). Clamp the counts so
        # that every index produced below is a valid index into `aa_refseq`;
        # without this the forward window would contain indices past the end of
        # the reference and the reverse window would contain negative indices.
        aa_refseq_len = len(self.aa_refseq)
        n_readable_aas_f = min(max(n_readable_aas_f, 0), aa_refseq_len)
        n_readable_aas_r = min(max(n_readable_aas_r, 0), aa_refseq_len)

        # The forward read covers the first amino acids; the reverse read covers
        # the last ones.
        self.forward_readable_aas = list(range(0, n_readable_aas_f))
        self.reverse_readable_aas = list(range(aa_refseq_len - n_readable_aas_r,
                                               aa_refseq_len))

        # Get only unique mutable inds, in ascending order
        forward_set = set(self.forward_readable_aas)
        reverse_set = set(self.reverse_readable_aas)
        self.mutable_aa_inds = np.array(sorted(forward_set | reverse_set), dtype = int)

        # Get the set of indices that is captured by both the forward and reverse reads
        self.double_count_inds = forward_set & reverse_set

    def assign_qualities(self, min_q_allowed):
        """
        Assigns base quality scores to all sequence components. This includes:
        1. Primer seeds
        2. Barcodes
        3. Adapters
        4. Frameshifts
        5. The codon refseq

        Parameters
        ----------
        min_q_allowed: int
            Lowest quality score that may be assigned to any base.
        """
        # Create a quality generator
        q_generator = QualityGenerator(min_q_allowed)

        # Primer seed qualities
        self.primer_qualities_f = q_generator.generate_qualities(self.primer_seed_len_f)
        self.primer_qualities_r = q_generator.generate_qualities(self.primer_seed_len_r)

        # Barcode qualities
        self.fbc_qualities = q_generator.generate_qualities(BARCODE_LENGTH)
        self.rbc_qualities = q_generator.generate_qualities(BARCODE_LENGTH)

        # Adapter qualities
        self.adapter_qualities_f = q_generator.generate_qualities(ADAPTER_LENGTH_F)
        self.adapter_qualities_r = q_generator.generate_qualities(ADAPTER_LENGTH_R)

        # Frameshift qualities
        self.frameshift_front_qualities = q_generator.generate_qualities(self.frameshift_front)
        self.frameshift_back_qualities = q_generator.generate_qualities(self.frameshift_back)

        # Variable region base qualities (one per base of the codon refseq)
        self.base_variable_qualities = q_generator.generate_qualities(self.refseq_len * 3)

    @staticmethod
    def aa_seq_to_codon_seq(aa_seq):
        """
        Returns a list with one randomly chosen codon for each amino acid in
        `aa_seq`. Codons are drawn with `RANDOM_RNG` from `CODON_TABLE`.
        """
        return [RANDOM_RNG.choice(CODON_TABLE.aa_to_codon[aa]) for aa in aa_seq]

    @property
    def refseq_bp_seq(self):
        """
        Returns the sequence of the refseq that will be fed into evSeq: forward
        primer seed, front frameshift bases, codons, back frameshift bases, and
        reverse primer seed, joined into a single string.
        """
        return "".join(
            self.primer_seed_f +
            self.frameshift_bp_front +
            self.codon_refseq +
            self.frameshift_bp_back +
            self.primer_seed_r
        )

    @property
    def refseq_bp_qualities(self):
        """
        Returns the qualities of the sequence returned by `refseq_bp_seq`, in the
        same order. Requires `assign_qualities` to have been called.
        """
        return np.concatenate((
            self.primer_qualities_f,
            self.frameshift_front_qualities,
            self.base_variable_qualities,
            self.frameshift_back_qualities,
            self.primer_qualities_r
        ))
        
# Class that holds parameters that will be needed by evSeq
class Config():
    """
    A randomly sampled set of conditions for one simulated evSeq run.
    """
    def __init__(self, detailed = True):
        """
        Samples reference sequences, a read length, and the evSeq run parameters.

        detailed: Whether or not the run uses a detailed refseq file. A detailed
            run gets `N_INDICES` reference sequences; otherwise `N_PLATES`.
        """
        self.detailed = detailed

        # One reference sequence per index for a detailed run, else one per plate
        if detailed:
            n_refs_needed = N_INDICES
        else:
            n_refs_needed = N_PLATES
        self.refseqs = [FakeRefseq() for _ in range(n_refs_needed)]

        # The read length may not exceed the full sequencable length of the
        # shortest reference sequence, nor the global maximum.
        shortest_window = min(ref.readable_window_len for ref in self.refseqs)
        readlength_ceiling = min(shortest_window, MAX_READLENGTH)
        self.readlength = NP_RNG.integers(MIN_READLENGTH, readlength_ceiling)

        # Sample the parameters that define the run
        self.average_q_cutoff = NP_RNG.integers(MIN_GLOBAL_QUAL_CUTOFF,
                                                MAX_GLOBAL_QUAL_CUTOFF)
        self.bp_q_cutoff = NP_RNG.integers(MIN_BP_QUAL_CUTOFF,
                                           MAX_BP_QUAL_CUTOFF)
        self.length_cutoff = NP_RNG.uniform(MIN_SEQLEN_CUTOFF,
                                            MAX_SEQLEN_CUTOFF)
        self.variable_thresh = NP_RNG.uniform(MIN_VARIABLE_THRESH,
                                              MAX_VARIABLE_THRESH)
        self.variable_count = NP_RNG.integers(MIN_VARIABLE_COUNT,
                                              MAX_VARIABLE_COUNT)

        # Baseline qualities start at the stricter of the two quality cutoffs.
        # Each reference also gets its mutable windows, which cover the amino
        # acids captured in full by the read length.
        quality_floor = max(self.average_q_cutoff, self.bp_q_cutoff)
        for reference in self.refseqs:
            reference.assign_qualities(quality_floor)
            reference.define_refseq_windows(self.readlength)

The basic data generator is a FakeWell instance. This holds methods for building fake data from the Config object.

In [6]:
# Create a class that holds all relevant information for a variant
class Variant(FakeData):
    """
    One variant of a reference sequence within a well.

    Holds the mutated positions, the mutant amino acids and codons at those
    positions, and the number of reads assigned to the variant.
    """
    def __init__(self, refseq, counts, minimum_reads_allowed):
        """
        refseq: The `FakeRefseq` this variant is derived from. Its mutable
            windows must already be defined.
        counts: Total number of reads assigned to this variant.
        minimum_reads_allowed: Smallest number of reads the variant may keep.
        """
        self.refseq = refseq
        self.total_counts = counts
        self.minimum_reads_allowed = minimum_reads_allowed

        # Pick where the variant differs from the reference and how
        self.choose_variable_positions()

        # TODO: Assign Q scores.
        # TODO: Add noise to the codons, making sure to consider (1) that we
        # don't accidentally drop below the read threshold and (2) whether the
        # overlap between forward and reverse can cover for any read errors.

    def choose_variable_positions(self):
        """
        Picks the mutated positions of the variant, then a mutant amino acid
        and a codon for each of them.
        """
        # How many positions to mutate, as a fraction of the mutable positions
        mutable_inds = self.refseq.mutable_aa_inds
        n_mutable = len(mutable_inds)
        fewest = int(MIN_PERC_MUTATED * n_mutable)
        most_exclusive = int(MAX_PERC_MUTATED * n_mutable) + 1
        self.n_variable_positions = NP_RNG.integers(fewest, most_exclusive)

        # Which positions to mutate. Only readable positions are eligible.
        self.mutated_positions = NP_RNG.choice(mutable_inds,
                                               size = self.n_variable_positions,
                                               replace = False)
        self.mutated_positions.sort()

        # Which amino acid each position mutates to
        self.variable_aas = RANDOM_RNG.choices(ALLOWED_AAS,
                                               k = self.n_variable_positions)

        # Which codon encodes each mutant amino acid. The codon already present
        # in the refseq at that position may not be reused.
        self.variable_codons = []
        for slot in range(len(self.variable_aas)):
            target_aa = self.variable_aas[slot]
            refseq_codon = self.refseq.codon_refseq[self.mutated_positions[slot]]
            candidates = [option for option in CODON_TABLE.aa_to_codon[target_aa]
                          if option != refseq_codon]

            # No alternative codon exists (e.g., methionine was selected and the
            # refseq is already methionine): fall back to leucine.
            if not candidates:
                self.variable_aas[slot] = "L"
                candidates = CODON_TABLE.aa_to_codon["L"]

            self.variable_codons.append(RANDOM_RNG.choice(candidates))

    def assign_qualities(self):
        """
        Assigns counts to different bases and amino acids, then assigns quality
        scores to get them to these counts. Not yet implemented.
        """
        # TODO: Add noise to the codons, dropping them below the perfect counts.
        # Consider (1) that we don't accidentally drop below the read threshold
        # and (2) whether the overlap between forward and reverse can cover for
        # any read errors.
        # TODO: Make sure the qualities of the frameshift components are also
        # added on to this output.
        return None

    def build_expected_count_arrays(self):
        """
        Returns `(expected_aa_counts, expected_bp_counts)` for the mutated
        positions, assuming every read is perfect.

        Each mutated position contributes one amino acid count and three base
        counts equal to `total_counts`, doubled where the position is covered by
        both the forward and the reverse read.
        """
        # Flag the mutated positions that sit in the forward/reverse overlap
        in_overlap = np.array([position in self.refseq.double_count_inds
                               for position in self.mutated_positions],
                              dtype = bool)

        # One count per amino acid; overlapping positions are seen twice
        expected_aa_counts = np.full(self.n_variable_positions, self.total_counts)
        expected_aa_counts[in_overlap] *= 2

        # Each amino acid count applies to all three of its bases
        expected_bp_counts = np.repeat(expected_aa_counts, 3)

        return expected_aa_counts, expected_bp_counts

    def build_perfect_reads(self):
        """
        Assigns a `perfect_reads` variable to the instance. Not yet implemented.
        """
        # TODO: Decide on the ratio between the different variants.
        # TODO: Get the total sequences we want at each **position**.
        # TODO: Build a set of perfect reads up to the target numbers.
        return None

    def build_corrupted_reads(self):
        """
        Assigns `corrupted_reads` and `corrupted_bases` variables to the
        instance. Not yet implemented.
        """
        # TODO: Choose how many sequences to add of each flavor.
        # TODO: Add sequences filtered out by bp_q_cutoff. This is `corrupted_bases`.
        # TODO: Add indels. These will go into `corrupted_bases`. We need to
        # record the positions where indels are found.
        # TODO: Add sequences filtered out by the average_q_cutoff. This is
        # `corrupted_reads`.
        # TODO: Add sequences filtered out by the length_cutoff. This is added
        # to `corrupted_reads`.
        return None

    def build_output_counts(self):
        """
        Builds output files for the different `OutputCounts`. Not yet implemented.
        """
        return None

    def build_filtered_fastqs(self):
        """
        Builds the ParsedFilteredFastqs files for the different wells. Not yet
        implemented.
        """
        return None
    
# Class that holds information for a test well
class FakeWell(FakeData):
    """
    A single simulated well: a total read count split across one or more
    `Variant` objects that all derive from the same reference sequence.
    """
    def __init__(self, config, reference_sequence):
        """
        config: The `Config` describing the run (supplies `variable_thresh` and
            `variable_count`).
        reference_sequence: The `FakeRefseq` used for every variant in the well.
        """
        # Assign the config and reference sequence objects as instance variables
        self.config = config
        self.refseq = reference_sequence

        # Create a barcode variable as a placeholder. This will be filled when the well
        # is passed with others to a FakeRun instance.
        self.f_barcode = None
        self.r_barcode = None

        # Get the total number of reads in the well.
        self.total_reads = NP_RNG.integers(MIN_N_READS, MAX_N_READS)

        # Decide on how many variants we want in the well.
        self.assign_n_variants()

        # Decide the relative abundance of each variant, then build variants
        variant_abundances, minimum_reads_per_variant = self.calculate_variant_abundances()
        self.variants = [Variant(reference_sequence, abundance, minimum_reads_per_variant)
                         for abundance in variant_abundances]

        # TODO: Create a "NNN" version of the reference sequence for use in
        # processing. Note that we don't need to capture all positions -- we want
        # evSeq to handle finding some of them sometimes. Thus, the NNN refseq is
        # derived from the first variant in the well.
        # TODO: Also at this step, add padding to the refseq.

    def _minimum_reads_per_variant(self):
        """
        Returns the fewest reads a variant needs to clear both the
        `variable_thresh` fraction of the well and the `variable_count` floor.
        """
        return max(
            int(np.ceil(self.config.variable_thresh * self.total_reads)),
            self.config.variable_count
        )

    def assign_n_variants(self):
        """
        Determines how many variants are in the well and stores the result in
        `self.n_variants`. There is always at least one variant.
        """
        # The maximum allowed is the minimum of (1) how many variants we can
        # spread reads over to get above the `variable_count` threshold, (2) the
        # most allowed with the given `variable_thresh`, (3) the `MAX_N_VARIANTS`
        # global, and (4) how many variants can each receive the per-variant
        # minimum used by `calculate_variant_abundances`.
        count_divisor = 1 if self.config.variable_count == 0 else self.config.variable_count
        variant_caps = [self.total_reads // count_divisor, MAX_N_VARIANTS]

        # A threshold of zero places no limit on the number of variants (and
        # would otherwise divide by zero).
        if self.config.variable_thresh > 0:
            variant_caps.append(int(1 // self.config.variable_thresh))

        # The per-variant minimum rounds the threshold *up* to whole reads, so
        # `floor(1 / variable_thresh)` variants do not always fit. For example, a
        # threshold of 0.3 with 5 reads needs 2 reads per variant: only 2 variants
        # fit, not 3. Without this cap the abundance sampling is asked to draw
        # from an empty range.
        minimum_reads_per_variant = self._minimum_reads_per_variant()
        if minimum_reads_per_variant > 0:
            variant_caps.append(self.total_reads // minimum_reads_per_variant)

        # Make it so that we always have at least one variant
        max_n_variants = max(min(variant_caps), 1)
        self.n_variants = NP_RNG.integers(MIN_N_VARIANTS, max_n_variants + 1)

    def calculate_variant_abundances(self):
        """
        Decide how many counts go to each variant.

        Returns
        -------
        variant_counts: list
            Number of reads for each variant; sums to `self.total_reads`.
        minimum_reads_per_variant: int
            The fewest reads any variant was allowed to receive when there is
            more than one variant. A lone variant receives every read in the
            well, even if that is fewer than this minimum.
        """
        # Each variant must be at least as abundant as both the variable
        # threshold and the variable counts
        minimum_reads_per_variant = self._minimum_reads_per_variant()

        # If there is only 1 variant, then it gets all reads
        if self.n_variants == 1:
            variant_counts = [self.total_reads]

        # Otherwise every variant but the last samples from the range of minimum
        # reads per variant up to whatever can be taken while still leaving the
        # minimum for each variant that follows. The last variant takes the rest.
        else:
            reads_left = self.total_reads
            variant_counts = []
            for variants_left in range(self.n_variants, 1, -1):

                # Get the (exclusive) maximum number of reads that we can sample
                max_reads_available_ind = FakeWell.calculate_maximum_reads_ind(
                    reads_left, variants_left, minimum_reads_per_variant
                )

                # Sample the number of reads for this variant
                sampled_reads = NP_RNG.integers(minimum_reads_per_variant,
                                                max_reads_available_ind)
                variant_counts.append(sampled_reads)
                reads_left -= sampled_reads

            variant_counts.append(reads_left)

        # Checks to make sure the counts were assigned appropriately
        assert len(variant_counts) == self.n_variants
        assert sum(variant_counts) == self.total_reads

        return variant_counts, minimum_reads_per_variant

    def build_output_counts(self):
        """
        Builds output files for the different `OutputCounts`. Not yet implemented.
        """
        pass

    def build_filtered_fastqs(self):
        """
        Builds the ParsedFilteredFastqs files for the different wells. Not yet
        implemented.
        """
        pass

    @staticmethod
    def calculate_maximum_reads_ind(total_reads_available, total_variants, minimum_reads_per_variant):
        """
        Returns the exclusive upper bound on the reads one variant may take so
        that each of the other `total_variants - 1` variants can still receive
        `minimum_reads_per_variant` reads.
        """
        return total_reads_available - ((total_variants - 1) * minimum_reads_per_variant) + 1

In [7]:
# Smoke test: build a (detailed) configuration, build one well from its first
# reference sequence, and take the first variant of that well for inspection.
test_config = Config()
test_well = FakeWell(test_config, test_config.refseqs[0])
test_variant = test_well.variants[0]
# Display the amino acid indices covered by both the forward and reverse reads
test_variant.refseq.double_count_inds

{39, 40, 41, 42, 43}

In [8]:
# Fewest reads the variant may keep, as computed by its FakeWell from
# `variable_thresh` and `variable_count`
test_variant.minimum_reads_allowed

13

In [9]:
# Number of reads the well assigned to this variant
test_variant.total_counts

41

In [10]:
# Mutant amino acid chosen for each mutated position ("*" is a stop codon)
test_variant.variable_aas

['I', 'S', 'Y', 'P', 'S', 'L', 'P', 'E', 'A', '*', 'N', 'E', 'Q', 'E', 'L']

In [11]:
# Codon chosen to encode each mutant amino acid, in the same order as `variable_aas`
test_variant.variable_codons

['ATA',
 'AGT',
 'TAT',
 'CCT',
 'AGC',
 'TTG',
 'CCA',
 'GAG',
 'GCA',
 'TAA',
 'AAC',
 'GAG',
 'CAG',
 'GAA',
 'TTA']

In [12]:
# Sorted amino acid indices (into the reference sequence) that were mutated
test_variant.mutated_positions

array([ 9, 18, 27, 31, 32, 37, 39, 41, 46, 55, 63, 65, 66, 75, 76])

In [13]:
# Work-in-progress prototype of the logic that will become
# `Variant.assign_qualities`. It plans which reads and positions receive noise
# and which overlapping positions are rescued; the steps that actually edit
# counts and qualities are still TODO.

# Build expected output counts for amino acids and bases
expected_aa_counts, expected_bp_counts = test_variant.build_expected_count_arrays()

# Decide which amino acid positions are eligible for noise. The first and last
# positions of the forward/reverse overlap region are never eligible. When the
# reads do not overlap there is nothing to exclude.
# NOTE(review): an earlier comment said "within 1 of the double overlap region";
# the code only excludes the two boundary positions themselves -- confirm intent.
double_count_inds = test_variant.refseq.double_count_inds
if double_count_inds:
    forbidden_noisy_aas = {min(double_count_inds), max(double_count_inds)}
else:
    forbidden_noisy_aas = set()
noisened_position_options = [pos for pos in test_variant.mutated_positions
                             if pos not in forbidden_noisy_aas]

# Decide how many reads for combos of amino acids will have noise
# added to them. The number left untouched must fall above the minimum counts
# required, so nothing is destroyed when there is no buffer above the minimum
# (a lone variant can hold fewer reads than the minimum) or when there is no
# eligible position to add noise to.
buffer_region = test_variant.total_counts - test_variant.minimum_reads_allowed
if buffer_region > 0 and len(noisened_position_options) > 0:
    n_combos_destroyed = NP_RNG.integers(buffer_region)
else:
    n_combos_destroyed = 0

# Get the expected number of both bp and aa combinations
expected_combo_counts = test_variant.total_counts - n_combos_destroyed

# Decide which reads will have noise added.
read_inds = np.arange(test_variant.total_counts)
noisy_reads = NP_RNG.choice(read_inds,
                            size = n_combos_destroyed,
                            replace = False)
noisy_reads.sort()

# Decide which amino acid within each noisy read will have noise added. Several
# reads may be damaged at the same position, so this samples with replacement.
noisy_positions = RANDOM_RNG.choices(noisened_position_options,
                                     k = n_combos_destroyed)
noisy_pos_set = set(noisy_positions)

# Create two quality score arrays. One is for the forward read and the other
# is for the reverse reads
q_score_array_f = test_variant.refseq.base_variable_qualities.copy()
q_score_array_r = q_score_array_f.copy()

# TODO: Add noise to positions. Adjust counts and qualities accordingly.

# Of amino acids found in the overlapping region, determine which ones
# will need to be rescued (i.e., have only the forward or reverse read damaged).
# Options are sorted so the draw does not depend on set iteration order, and
# sampled without replacement so that each rescued position is distinct.
rescue_options = sorted(pos for pos in double_count_inds
                        if pos not in noisy_pos_set)
n_rescue_opts = len(rescue_options)
min_to_rescue = int(MIN_RESCUE_PERC * n_rescue_opts)
max_to_rescue = int(MAX_RESCUE_PERC * n_rescue_opts) + 1
n_to_rescue = NP_RNG.integers(min_to_rescue, max_to_rescue)
aas_to_rescue = RANDOM_RNG.sample(rescue_options, k = int(n_to_rescue))

# TODO: Create rescued positions. Adjust counts and qualities accordingly.
# Note that rescued aas can be broken in one of two ways: (1) setting Q-score too
# low and (2) having Q-score lower in one read than the other and including the
# wrong basepair

# TODO: Reduce quality scores of individual bases below the allowed threshold.
# Record the new counts for bases.

In [14]:
# Expected amino acid counts per mutated position; positions inside the
# forward/reverse overlap are counted twice
expected_aa_counts

array([41, 41, 41, 41, 41, 41, 82, 82, 41, 41, 41, 41, 41, 41, 41])

In [79]:
# Expected base counts (three per mutated position, same doubling as above).
# NOTE(review): the recorded execution count (In [79]) is out of sequence, so
# this output was not produced by a top-to-bottom run.
expected_bp_counts

array([41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 82, 82, 82, 82, 82, 82, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41])

In [77]:
# Re-display the overlap indices to compare against the doubled counts above.
# NOTE(review): executed out of sequence (In [77]) and duplicates an earlier cell.
test_variant.refseq.double_count_inds

{39, 40, 41, 42, 43}

In [78]:
# Re-display the mutated positions to compare against the doubled counts above.
# NOTE(review): executed out of sequence (In [78]) and duplicates an earlier cell.
test_variant.mutated_positions

array([ 9, 18, 27, 31, 32, 37, 39, 41, 46, 55, 63, 65, 66, 75, 76])

In [34]:
# Number of reads selected to receive noise.
# NOTE(review): executed out of sequence (In [34]); the value depends on the
# state of NP_RNG, so it may not reproduce on Restart & Run All.
n_combos_destroyed

5

In [24]:
# Reads available above the per-variant minimum (total_counts - minimum_reads_allowed).
# NOTE(review): executed out of sequence (In [24]).
buffer_region

28

All FakeWell instances are held within a FakeRun instance that assigns barcodes to the fake wells, handles fake input generation, and builds expected outputs.

In [5]:
# Class that holds information for a test run
class FakeRun(FakeData):
    """
    A simulated evSeq run that holds a collection of `FakeWell` objects.

    Intended to assign barcodes to the wells, generate the fake input files,
    and build the expected outputs. Only the constructor is implemented so far.
    """
    def __init__(self, fakewells, detailed = True):
        """
        fakewells: A list of fully prepared FakeWell objects.
        detailed: Whether or not we are using a detailed refseq file.
        """
        # Assign fakewells as an instance variable
        self.fakewells = fakewells
        self.detailed = detailed

    def build_input(self):
        """
        Builds the fastq files that are fed into evSeq. Not yet implemented.
        """
        # TODO: Assign a barcode combo to each of the fakewells
        # TODO: Build fastq files from the fakewells
        # TODO: Return the fastq files ready for processing
        pass

    def build_output_counts(self):
        """
        Builds output files for the different `OutputCounts`. Not yet implemented.
        """
        pass

    def build_filtered_fastqs(self):
        """
        Builds the ParsedFilteredFastqs files for the different wells. Not yet
        implemented.
        """
        pass