In [1]:
from abc import ABC, abstractmethod
from Bio.Data import CodonTable

import numpy as np
import pandas as pd
import random

This notebook is for putting together the end-to-end tests of evSeq. We will reconstruct inputs for known outputs, then use them to make sure the evSeq software outputs what we expect.

Steps:
1. Build a random amino acid sequence. This is our reference AA.
2. Decide if we are operating in detailed_refseq mode or not. If not, then the random amino acid sequence gives what we expect for the full plate.
3. Build variants of the input amino acid sequences. Decide if we have a mixed well or not. A mixed well means we need a second amino acid sequence.
4. Build both an "NNN" version and a no-variation version of the reference sequence, corresponding to the positions where we expect mutations.
5. Choose a random codon to encode the amino acid changes in the sequence.
6. For building reads, choose:
    1. Ratio of different sequences. This decides whether or not they are recognized by `variable_thresh`
    2. Number of total sequences *at each position*. This decides whether a well is considered dead or not by `variable_count`
7. Build a set of perfect reads up to the total number of target sequences. 
8. Add on bad sequences that should get filtered out by the `bp_q_cutoff`, `length_cutoff`, and `average_q_cutoff` parameters. Also add indels.
9. Pad on additional sequence information that creates frameshifts. Add to an input file for processing.

Some methods are shared. These are held in an abstract base class:

In [2]:
class FakeData(ABC):
    """
    Abstract interface shared by the fake-data generators used for testing.

    A concrete subclass must override both `build_output_counts` and
    `build_filtered_fastqs` before it can be instantiated.
    """
    @abstractmethod
    def build_output_counts(self):
        """
        Builds output files for the different `OutputCounts`.

        The base implementation does nothing and returns None.
        """

    @abstractmethod
    def build_filtered_fastqs(self):
        """
        Builds the ParsedFilteredFastqs files for the different wells.

        Implementations are expected to use the `perfect_reads` +
        `corrupted_bases` reads to generate what we expect for the evSeq
        `ParsedFilteredFastqs` output. The base implementation does nothing
        and returns None.
        """

The BioPython codon table implementation is beyond bad, so we rewrite it here, fixing all of the BioPython flaws:

In [3]:
class CustomCodonTable():
    """
    Stand-alone codon table derived from BioPython's standard DNA table.

    Deliberately does not inherit from the BioPython class. Two lookups are
    exposed as instance attributes:

    codon_to_aa: dict mapping each codon to its one-letter amino acid, with
        stop codons mapped to "*".
    aa_to_codon: dict mapping each amino acid (and "*") to the list of ALL
        codons that encode it.
    """
    def __init__(self):

        # Start from the BioPython standard table. evSeq ships its own codon
        # table, so using an independent source here is useful for validation.
        standard_table = CodonTable.standard_dna_table

        # Forward lookup: copy the BioPython mapping (so it is never mutated),
        # then add the stop codons, which BioPython keeps in a separate list.
        self.codon_to_aa = standard_table.forward_table.copy()
        self.codon_to_aa.update((stop_codon, "*")
                                for stop_codon in standard_table.stop_codons)

        # Reverse lookup: BioPython only keeps one codon per amino acid, so
        # collect every synonymous codon ourselves.
        self.aa_to_codon = {}
        for codon, aa in self.codon_to_aa.items():
            self.aa_to_codon.setdefault(aa, []).append(codon)

We need global variables that define the bounds on our testing. Everything is randomly produced, so we need to set ranges on our sampling:

In [4]:
# Import evSeq globals needed for testing
from evSeq.util.globals import (BARCODE_LENGTH, 
                                ADAPTER_F,
                                ADAPTER_LENGTH_F,
                                ADAPTER_R,
                                ADAPTER_LENGTH_R)

# NOTE: wherever a MIN/MAX pair below is passed to `NP_RNG.integers(low, high)`
# (as done in FakeRefseq and Config), numpy treats `high` as EXCLUSIVE by
# default, so the MAX value itself is never drawn.

# Reference sequence bounds (length in amino acids; FakeRefseq samples
# `refseq_len` from this range)
MIN_REFSEQ_LEN = 150
MAX_REFSEQ_LEN = 700

# Bounds on readlength (in bases; Config samples `readlength` from this range)
MIN_READLENGTH = 150
MAX_READLENGTH = 300

# Bounds on quality cutoffs
# Range sampled for `bp_q_cutoff` in Config
MIN_BP_QUAL_CUTOFF = 15
MAX_BP_QUAL_CUTOFF = 35

# Range sampled for `average_q_cutoff` in Config
MIN_GLOBAL_QUAL_CUTOFF = 15
MAX_GLOBAL_QUAL_CUTOFF = 35

# Bounds on sequence length cutoffs (sampled uniformly as a float for
# `length_cutoff` in Config; presumably a fraction of the read length --
# TODO confirm against evSeq's `length_cutoff` definition)
MIN_SEQLEN_CUTOFF = 0.0
MAX_SEQLEN_CUTOFF = 1.0

# Bounds on number of indels added
MIN_INDELS_ADDED = 1
MAX_INDELS_ADDED = 3

# Bounds on primer lengths (FakeRefseq samples `primer_seed_len` from this range)
PRIMER_MIN_LEN = 10
PRIMER_MAX_LEN = 40

# Bounds on the frameshift for a reference seq (extra bases added to the front
# and back of the codon sequence; with the exclusive upper bound this yields
# 0, 1 or 2 bases)
FRAMESHIFT_MIN = 0
FRAMESHIFT_MAX = 3

# Codon table and allowed amino acid characters
# ALLOWED_AAS is the sorted tuple of every key in the reverse codon table
# (this includes "*" for stop); INT_TO_AA maps a positional index to each
# character so amino acids can be sampled as integers.
CODON_TABLE = CustomCodonTable()
ALLOWED_AAS = tuple(sorted(list(CODON_TABLE.aa_to_codon.keys())))
INT_TO_AA = dict(enumerate(ALLOWED_AAS))
N_AAS = len(ALLOWED_AAS)

# Random number generators
# The seed is the sum of the character code points of "PDawg". The numpy and
# stdlib generators share the seed; the order in which they are consumed
# determines every value generated downstream.
RANDOM_SEED = sum(ord(char) for char in "PDawg")
NP_RNG = np.random.default_rng(RANDOM_SEED)
RANDOM_RNG = random.Random(RANDOM_SEED)

# Barcode file
# NOTE: the path is relative, so it resolves against the kernel's current
# working directory.
# N_INDICES is the number of rows in the index map; N_PLATES is the number of
# distinct values in its `IndexPlate` column.
INDEX_DF = pd.read_csv("../evSeq/util/index_map.csv")
N_INDICES = len(INDEX_DF)
N_PLATES = len(INDEX_DF.IndexPlate.unique())

We will store basic variables used for building test data in a configuration class, defined below:

In [14]:
class FakeRefseq():
    """
    A randomly generated reference sequence used to build evSeq test inputs.

    Attributes assigned by `__init__`:
        refseq_len: Number of amino acids in the reference.
        aa_refseq: List of one-letter amino acid characters, each drawn from
            `ALLOWED_AAS`.
        codon_refseq: List with one randomly chosen codon per amino acid.
        frameshift_front, frameshift_back: Number of extra bases placed
            before/after the coding region.
        codon_refseq_len: Total nucleotide length, i.e. three bases per amino
            acid plus both frameshifts.
        primer_seed_len: Randomly chosen primer seed length.

    Attribute assigned by `define_refseq_windows`:
        mutable_aa_inds: Sorted integer numpy array of amino acid indices that
            are fully covered by a forward or reverse read.
    """
    def __init__(self):
        """
        Samples all random properties of the reference sequence. The order of
        the draws from `NP_RNG` / `RANDOM_RNG` is significant for
        reproducibility and must not be changed.
        """
        # Randomly create an amino acid reference sequence. Record the length.
        self.refseq_len = NP_RNG.integers(MIN_REFSEQ_LEN, MAX_REFSEQ_LEN)
        aa_ints = NP_RNG.choice(N_AAS, size = self.refseq_len)
        self.aa_refseq = [INT_TO_AA[aa_int] for aa_int in aa_ints]

        # Assign a codon to each amino acid, chosen at random among the
        # synonymous codons for that amino acid.
        self.codon_refseq = [RANDOM_RNG.choice(CODON_TABLE.aa_to_codon[aa])
                             for aa in self.aa_refseq]

        # Decide on the frameshift of the reference sequence
        self.frameshift_front = NP_RNG.integers(FRAMESHIFT_MIN, FRAMESHIFT_MAX)
        self.frameshift_back = NP_RNG.integers(FRAMESHIFT_MIN, FRAMESHIFT_MAX)

        # Assign a length to the codon reference sequence
        self.codon_refseq_len = (self.refseq_len * 3 +
                                 self.frameshift_front +
                                 self.frameshift_back)

        # Assign the primer seed length
        self.primer_seed_len = NP_RNG.integers(PRIMER_MIN_LEN, PRIMER_MAX_LEN)

    def define_refseq_windows(self, readlength):
        """
        Defines the window of amino acid positions that may be mutated and
        stores it as `self.mutable_aa_inds`.

        Parameters
        ----------
        readlength: int
            Length of a sequencing read in bases, including the barcode and
            adapter.

        Returns
        -------
        None. `self.mutable_aa_inds` is set to a sorted, duplicate-free
        integer numpy array. It is empty when the read is too short to cover
        a full codon.
        """
        # NOTE(review): the original comment mentions the primer binding
        # region, but `primer_seed_len` is not subtracted here -- confirm
        # whether positions under the primer should be excluded.

        # Bases of each read left over once the barcode has been removed.
        effective_readlength = readlength - BARCODE_LENGTH
        aa_refseq_len = len(self.aa_refseq)

        # Number of whole codons captured by the forward / reverse read after
        # the frameshift padding and adapter have been accounted for.
        n_readable_f = (effective_readlength - self.frameshift_front - ADAPTER_LENGTH_F) // 3
        n_readable_r = (effective_readlength - self.frameshift_back - ADAPTER_LENGTH_R) // 3

        # Clamp to the reference: a read that is too short covers nothing, and
        # a read longer than the reference cannot cover more residues than
        # exist. Without this, long reads produced negative or out-of-range
        # indices.
        n_readable_f = min(max(n_readable_f, 0), aa_refseq_len)
        n_readable_r = min(max(n_readable_r, 0), aa_refseq_len)

        # Forward read covers the first residues, reverse read the last ones.
        # A set removes positions seen by both reads.
        mutable_aa_inds = set(range(n_readable_f))
        mutable_aa_inds.update(range(aa_refseq_len - n_readable_r, aa_refseq_len))

        self.mutable_aa_inds = np.array(sorted(mutable_aa_inds), dtype = int)
        
# Class that holds parameters that will be needed by evSeq
class Config():
    """
    Randomly sampled set of conditions for a simulated evSeq run.

    Attributes: `refseqs` (list of FakeRefseq), `readlength`,
    `average_q_cutoff`, `bp_q_cutoff` and `length_cutoff`.
    """
    def __init__(self, detailed = True):
        """
        Builds a set of conditions that might be passed into an evSeq run.

        detailed: When truthy, one reference sequence is generated per row of
            the index map (`N_INDICES`); otherwise one per index plate
            (`N_PLATES`).
        """
        # Build as many reference sequences as the chosen mode requires
        if detailed:
            n_refseqs = N_INDICES
        else:
            n_refseqs = N_PLATES
        self.refseqs = [FakeRefseq() for _ in range(n_refseqs)]

        # Pick the readlength. The lower sampling bound is the smaller of
        # MIN_READLENGTH and the shortest reference (in bases).
        codon_lengths = [ref.codon_refseq_len for ref in self.refseqs]
        readlength_floor = min(MIN_READLENGTH, min(codon_lengths))
        self.readlength = NP_RNG.integers(readlength_floor, MAX_READLENGTH)

        # With the readlength fixed, let every reference work out which amino
        # acids are captured in full and may therefore be mutated.
        for ref in self.refseqs:
            ref.define_refseq_windows(self.readlength)

        # Sample the filtering parameters that define the run. The order of
        # these draws is kept so the random stream is consumed identically.
        self.average_q_cutoff = NP_RNG.integers(MIN_GLOBAL_QUAL_CUTOFF,
                                                MAX_GLOBAL_QUAL_CUTOFF)
        self.bp_q_cutoff = NP_RNG.integers(MIN_BP_QUAL_CUTOFF,
                                           MAX_BP_QUAL_CUTOFF)
        self.length_cutoff = NP_RNG.uniform(MIN_SEQLEN_CUTOFF,
                                            MAX_SEQLEN_CUTOFF)

In [17]:
# Build one randomized configuration using the default `detailed = True` mode.
test_config = Config()

The basic data generator is a FakeWell instance. This holds methods for building fake data from the Config object.

In [None]:
# Class that holds information for a test well
class FakeWell(FakeData):
    """
    Data generator for a single test well.

    Every method other than `__init__` is currently an unimplemented
    placeholder that returns None; the comments inside each one record the
    planned steps.
    """
    def __init__(self, reference_sequence):
        """
        reference_sequence: The reference sequence object for this well.
        """
        # Keep the reference sequence object on the instance
        self.refseq = reference_sequence

        # Barcodes are placeholders for now. They get filled in when the well
        # is passed with others to a FakeRun instance.
        self.f_barcode = self.r_barcode = None

    def choose_variable_positions(self):
        """
        Placeholder: chooses the positions to vary. Not yet implemented.
        """
        # Decide on the positions that will be varied within the sequence within the bounds
        # chosen during the initialization of the class.

        # Create a "NNN" version of the reference sequence for use in processing.
        # Note that we don't need to capture all positions -- we want evSeq to
        # handle finding some of them sometimes

    def choose_variants(self):
        """
        Placeholder: chooses the variants present in the well. Not yet implemented.
        """
        # Decide how many variants we want in the well

        # Choose variable amino acids.

        # Choose a random codon used for encoding amino acids

    def build_perfect_reads(self):
        """
        Assigns a `perfect_reads` variable to the instance. Not yet implemented.
        """
        # Decide on the ratio between the different variants

        # Get the total sequences we want at each **position**

        # Build a set of perfect reads up to the target numbers

    def build_corrupted_reads(self):
        """
        Assigns `corrupted_reads` and `corrupted_bases` variables to the instance.
        Not yet implemented.
        """
        # Choose how many sequences to add of each flavor.

        # Add sequences filtered out by bp_q_cutoff. This is `corrupted_bases`

        # Add indels. These will go into `corrupted_bases`. We need to record the positions
        # where indels are found.

        # Add sequences filtered out by the average_q_cutoff. This is `corrupted_reads`.

        # Add sequences filtered out by the length_cutoff. This is added to `corrupted_reads`.

    def pad_for_frameshift(self):
        """
        Placeholder: pads the sequences to create frameshifts. Not yet implemented.
        """
        # Adds bases to the beginning and end of the sequences to cause frameshifts.
        # These bases will be analyzed by the software, but should not affect AA indices.

    def build_output_counts(self):
        """
        Builds output files for the different `OutputCounts`. Not yet implemented.
        """

    def build_filtered_fastqs(self):
        """
        Builds the ParsedFilteredFastqs files for the different wells.
        Not yet implemented.
        """

All FakeWell instances are held within a FakeRun instance that assigns barcodes to the fake wells, handles fake input generation, and builds expected outputs.

In [5]:
# Class that holds information for a test run
class FakeRun(FakeData):
    """
    Container for the FakeWell instances that make up one simulated run.

    Declared as a class deriving from FakeData. It was previously written
    with `def`, which made `FakeRun` a plain function that returned None.
    Apart from `__init__`, the methods are unimplemented placeholders.
    """
    def __init__(self, fakewells, detailed = True):
        """
        fakewells: A list of fully prepared FakeWell objects.
        detailed: Whether or not we are using a detailed refseq file.
        """
        # Assign fakewells as an instance variable, and remember the mode
        self.fakewells = fakewells
        self.detailed = detailed

    def build_input(self):
        """
        Placeholder: builds the fastq input for the run. Not yet implemented.
        """
        # Assign a barcode combo to each of the fakewells

        # Build fastq files from the fakewells

        # Return the fastq files ready for processing

    def build_output_counts(self):
        """
        Builds output files for the different `OutputCounts`. Not yet implemented.
        """

    def build_filtered_fastqs(self):
        """
        Builds the ParsedFilteredFastqs files for the different wells.
        Not yet implemented.
        """