This notebook stress-tests the evSeq code by passing in randomly generated inputs whose expected output is known. We then check whether the evSeq outputs match the expected outputs.

In [1]:
import warnings
import os
import subprocess
import pandas as pd
import numpy as np
import math
import itertools
import shutil
import random
from glob import glob

import tests.data_generation.globals as test_glob

from tests.data_generation.globals import INDEX_DF, REFSEQ_COL_NAMES, SAVELOC, DECOUPLED_AA_COL_NAMES

from tests.data_generation.well_generator import FakeWell
from tests.data_generation.config_generator import Config

Next, define the `FakeRun` container, which holds all of the information for a single test run:

In [2]:
# Calculates expected counts for a parent well
def calculate_parent_counts(well):
    """
    Return the mean expected count per codon for a parent well.

    well: An object exposing `refseq` (with `refseq_len`, `og_mutable`,
        `primer_seed_len_f`, `primer_seed_len_r`, `frameshift_front` and
        `frameshift_back`) and `variants` (each with `expected_aa_counts`
        and `total_counts`).

    The mean is taken over the positions in `og_mutable` plus the codons
    added by the primer seeds/frameshifts, and is truncated to an int.
    """
    refseq = well.refseq

    # Accumulate the expected per-position counts of every variant
    per_position = np.zeros(refseq.refseq_len)
    for variant in well.variants:
        per_position += variant.expected_aa_counts

    # Whole codons contributed by the primer seed + frameshift on each end
    codons_front = (refseq.primer_seed_len_f + refseq.frameshift_front) // 3
    codons_back = (refseq.primer_seed_len_r + refseq.frameshift_back) // 3
    n_extra_codons = codons_front + codons_back

    # Each added codon contributes the summed `total_counts` of all variants
    counts_all_variants = sum(variant.total_counts for variant in well.variants)
    summed_counts = (np.sum(per_position[refseq.og_mutable])
                     + n_extra_codons * counts_all_variants)

    # Average over every counted codon and truncate
    n_codons = n_extra_codons + len(refseq.og_mutable)
    return int(summed_counts / n_codons)

def check_well_is_parent(well, all_mutated_positions, nnn_positions):
    """
    Decide whether a well should be reported as a parent well.

    well: Object exposing `refseq.aa_refseq` and `variants`, where each
        variant has a `base_mut_aa_seq`.
    all_mutated_positions: Positions at which variability was introduced.
    nnn_positions: Positions flagged as "NNN" in the reference sequence.

    Returns True when there are no NNN positions and every variant carries
    the reference amino acid at every mutated position (trivially so when
    there are no mutated positions). Returns False otherwise.
    """
    # Any "NNN" position rules out a parent well
    if len(nnn_positions) != 0:
        return False

    # Nothing was mutated, so the well can only hold parent sequence
    if len(all_mutated_positions) == 0:
        return True

    reference = well.refseq.aa_refseq

    def _matches_reference(variant):
        # A variant counts as parent if it agrees with the reference at
        # every mutated position
        for pos in all_mutated_positions:
            if not (variant.base_mut_aa_seq[pos] == reference[pos]):
                return False
        return True

    # Evaluate every variant, then require all of them to be parent
    variant_is_parent = [_matches_reference(variant) for variant in well.variants]
    return all(variant_is_parent)

# Class that holds information for a test run
class FakeRun():
    """
    Container for one randomly generated evSeq test run.

    Construction draws a `Config` and builds one `FakeWell` per row of
    `INDEX_DF`. The remaining methods write the fastq/refseq input files,
    invoke the evSeq command line on them, and build the decoupled
    amino-acid table that evSeq is expected to produce.
    """

    def __init__(self, detailed=True):
        """
        detailed: Whether or not we are using a detailed refseq file.
        """
        self.detailed = detailed

        # The configuration is created first; the wells are built from it
        self.config = Config(detailed=detailed)
        self._build_wells()

    def build_fastq(self):
        """
        Write the forward (R1) and reverse (R2) fastq files for the run.

        Reads are collected from every well that is not a dud, in well order.
        """
        r1_entries = []
        r2_entries = []
        for well in self.wells:

            # Dud wells contribute no reads
            if well.dud_well:
                continue

            well_r1, well_r2 = well.build_all_reads()
            r1_entries.extend(well_r1)
            r2_entries.extend(well_r2)

        # Save the forward file, then the reverse file
        outputs = ((self.r1_saveloc, r1_entries), (self.r2_saveloc, r2_entries))
        for path, entries in outputs:
            with open(path, "w") as handle:
                handle.write("\n".join(entries))

    def build_refseq(self, include_nnn):
        """
        Write the reference sequence csv for the run.

        include_nnn: Forwarded to each well's `build_refseq_entry`. It is
            also stored on the instance as `include_nnn`.

        In detailed mode every well gets an entry; otherwise only the first
        well seen for each plate does.
        """
        # Record the latest status of including NNN
        self.include_nnn = include_nnn

        def _wells_needing_entries():
            # Yield each well that contributes a row to the refseq file
            plates_seen = set()
            for well in self.wells:
                if self.detailed or (well.platename not in plates_seen):
                    plates_seen.add(well.platename)
                    yield well

        # One slot per configured refseq, filled in well order
        entries = [None] * len(self.config.refseqs)
        for slot, well in enumerate(_wells_needing_entries()):
            entries[slot] = well.build_refseq_entry(include_nnn)

        # Tabulate, then order by the second and third refseq columns
        refseq_df = pd.DataFrame(entries, columns=REFSEQ_COL_NAMES)
        refseq_df = refseq_df.sort_values(by=[REFSEQ_COL_NAMES[1],
                                              REFSEQ_COL_NAMES[2]])

        # The well column only belongs in a detailed refseq file
        if not self.detailed:
            refseq_df = refseq_df.drop(columns=["Well"])

        refseq_df.to_csv(self.refseq_saveloc, index=False)

    def _build_wells(self):
        """
        Build one `FakeWell` per row of `INDEX_DF` and store them on
        `self.wells`.

        In detailed mode each well gets its own entry of
        `self.config.refseqs`; otherwise all wells of a plate share the
        entry assigned when that plate was first seen.
        """
        built_wells = []
        plates_seen = set()
        refseq_ind = -1
        for row in INDEX_DF.itertuples():

            # Move on to the next refseq for every well (detailed) or for
            # every newly encountered plate (not detailed)
            if self.detailed or (row.IndexPlate not in plates_seen):
                refseq_ind += 1
                plates_seen.add(row.IndexPlate)

            well = FakeWell(self.config, self.config.refseqs[refseq_ind])

            # Attach the plate/well identity and barcodes from the index row
            well.platename = row.IndexPlate
            well.wellname = row.Well
            well.f_barcode = row.FBC
            well.r_barcode = row.RBC

            built_wells.append(well)

        self.wells = built_wells

    def build_output_counts(self):
        """
        Builds output files for the different `OutputCounts`

        Not implemented yet; currently does nothing.
        """

    def run_evseq(self):
        """
        Run the evSeq command line interface on the saved refseq and fastq
        files, using the cutoffs and thresholds stored in the config.

        stdout and stderr are captured in the files given by `test_stdout`
        and `test_stderr`. `subprocess.run` is called with `check = True`,
        so a non-zero exit status raises.
        """
        # We keep all optional outputs as this is for debugging
        command = [
            "evSeq",
            self.refseq_saveloc,
            SAVELOC,
            "--output", SAVELOC,
            "--return_alignments",
            "--keep_parsed_fastqs",
        ]

        # Options that take their value from the configuration
        config_options = (
            ("--average_q_cutoff", self.config.average_q_cutoff),
            ("--bp_q_cutoff", self.config.bp_q_cutoff),
            ("--length_cutoff", self.config.length_cutoff),
            ("--variable_thresh", self.config.variable_thresh),
            ("--variable_count", self.config.variable_count),
        )
        for option, value in config_options:
            command.extend([option, str(value)])

        # Tell evSeq when the refseq file is a detailed one
        if self.detailed:
            command.append("--detailed_refseq")

        # Run the command. Fail if errors are thrown.
        with open(self.test_stdout, "wb") as out, \
                open(self.test_stderr, "wb") as err:
            subprocess.run(command, check=True, stdout=out, stderr=err)

    def gather_expected_positions(self):
        """
        Collect the positions at which variation is expected.

        Returns a tuple `(expected_nnn_positions, expected_mut_positions)`:

        expected_nnn_positions: Maps `(platename, wellname)` (detailed) or
            `platename` (not detailed) to the set of "NNN" positions. When
            NNN is included, these are the mutated positions of the first
            variant of the well (detailed) or of well "A01" of the plate
            (not detailed); dud wells get an empty set. When NNN is not
            included every set is empty.
        expected_mut_positions: Maps `(platename, wellname)` of every
            non-dud well to the mutated positions of all its variants that
            are not already captured as NNN positions.
        """
        expected_nnn_positions = {}
        for well in self.wells:

            # Pick the key. Without detailed refseqs, only well "A01"
            # defines the NNN positions of its plate.
            if self.detailed:
                nnn_key = (well.platename, well.wellname)
            elif self.include_nnn and well.wellname != "A01":
                continue
            else:
                nnn_key = well.platename

            # Only non-dud wells in an NNN run have NNN positions
            if self.include_nnn and not well.dud_well:
                expected_nnn_positions[nnn_key] = set(well.variants[0].mutated_positions)
            else:
                expected_nnn_positions[nnn_key] = set()

        # Extract all other positions where we added variability.
        expected_mut_positions = {}
        for well in self.wells:

            if well.dud_well:
                continue

            # Every position mutated in any variant of the well
            mutated_anywhere = set(np.concatenate(
                [variant.mutated_positions for variant in well.variants]
            ))

            # NNN positions that apply to this well
            platewell_key = (well.platename, well.wellname)
            if self.include_nnn:
                lookup_key = platewell_key if self.detailed else well.platename
                already_nnn = expected_nnn_positions[lookup_key]
            else:
                already_nnn = set()

            expected_mut_positions[platewell_key] = mutated_anywhere - already_nnn

        return expected_nnn_positions, expected_mut_positions

    def build_expected_decoupled_aa(self):
        """
        Build the decoupled amino-acid table we expect evSeq to output.

        Returns a dataframe with columns `DECOUPLED_AA_COL_NAMES`, sorted by
        "IndexPlate", "Well", "AaPosition" and "Aa". Dud wells contribute no
        rows, parent wells contribute a single "#PARENT#" row, and all other
        wells contribute one row per amino acid observed at each position
        where variation is expected.
        """
        nnn_lookup, mut_lookup = self.gather_expected_positions()

        rows = []
        for well in self.wells:

            if well.dud_well:
                continue

            # Positions with expected variation in this well
            well_key = (well.platename, well.wellname)
            nnn_positions = nnn_lookup[well_key if self.detailed else well.platename]
            mut_positions = mut_lookup[well_key]
            all_positions = nnn_positions | mut_positions

            # Parent wells are summarized by a single row
            if check_well_is_parent(well, all_positions, nnn_positions):
                rows.append([
                    well.platename,
                    well.platenickname,
                    well.wellname,
                    "#PARENT#",
                    "#PARENT#",
                    1.0, # We expect 100% alignment frequency given how we wrote the test code
                    calculate_parent_counts(well),
                    "#PARENT#",
                ])
                continue

            for pos in all_positions:

                # Traits shared by every row reported for this position
                is_nnn = pos in nnn_positions
                aa_position = str(well.refseq.aa_ind_start + pos)
                flag = np.nan if is_nnn else "Unexpected Variation"

                # Pool the expected counts of all variants by amino acid
                aa_to_count = {}
                for variant in well.variants:
                    aa = variant.base_mut_aa_seq[pos]
                    count = variant.expected_aa_counts[pos]
                    if aa in aa_to_count:
                        aa_to_count[aa] += count
                    else:
                        aa_to_count[aa] = count
                total_count = sum(aa_to_count.values())

                # If the only aa is the same as the parent and this is not a
                # "NNN" position, there is no entry as evSeq will not find it.
                if (
                    len(aa_to_count) == 1
                    and not is_nnn
                    and next(iter(aa_to_count)) == well.refseq.aa_refseq[pos]
                ):
                    continue

                for aa, aa_count in aa_to_count.items():
                    rows.append([
                        well.platename,
                        well.platenickname,
                        well.wellname,
                        aa_position,
                        aa,
                        aa_count / total_count,
                        total_count,
                        flag,
                    ])

        # Format and sort the output
        expected_aa_df = pd.DataFrame(rows, columns=DECOUPLED_AA_COL_NAMES)
        expected_aa_df = expected_aa_df.sort_values(
            by=["IndexPlate", "Well", "AaPosition", "Aa"]
        )

        return expected_aa_df

    @property
    def refseq_saveloc(self):
        """Path of the generated refseq csv inside `SAVELOC`."""
        return os.path.join(SAVELOC, "testinput_refseq.csv")

    @property
    def r1_saveloc(self):
        """Path of the generated forward-read fastq inside `SAVELOC`."""
        return os.path.join(SAVELOC, "testinput_R1_allreads.fastq")

    @property
    def r2_saveloc(self):
        """Path of the generated reverse-read fastq inside `SAVELOC`."""
        return os.path.join(SAVELOC, "testinput_R2_allreads.fastq")

    @property
    def test_stdout(self):
        """Path of the file that captures evSeq's stdout."""
        return os.path.join(SAVELOC, "test_stdout.txt")

    @property
    def test_stderr(self):
        """Path of the file that captures evSeq's stderr."""
        return os.path.join(SAVELOC, "test_stderr.txt")

In [3]:
def _decoupled_values_match(key, true_val, expected_val):
    """Return whether one cell of the evSeq output agrees with the expected cell."""
    # Frequencies are floats, so compare with a tolerance
    if key == "AlignmentFrequency":
        return bool(np.isclose(true_val, expected_val))

    # A missing flag only matches another missing flag. Test for missingness
    # by value: NaNs pulled out of a float column are fresh objects, so an
    # identity check against `np.nan` is not reliable.
    if key == "Flags" and (pd.isna(true_val) or pd.isna(expected_val)):
        return bool(pd.isna(true_val) and pd.isna(expected_val))

    # Positions may be read back as ints or strings; compare their text
    if key == "AaPosition":
        return str(true_val) == str(expected_val)

    return bool(true_val == expected_val)


def test_decoupled_aa(expected_out, true_out):
    """
    Compare the expected and true decoupled amino-acid tables row by row.

    expected_out: Dataframe of expected results.
    true_out: Dataframe of results produced by evSeq, sorted the same way.

    Returns `(passed, expected_out, true_out, expected_row, true_row)`.
    On failure the two rows are the first pair (as dicts) that disagree; if
    one table ran out of rows first, its row is None. On success they are the
    last pair compared, or None if both tables are empty.

    Prints "FAILURE FOUND" or "SUCCESS!!!!" accordingly.
    """
    expected_row = None
    true_row = None

    # zip_longest pads the shorter table with None, which lets us report a
    # difference in the number of rows as a failure
    paired_rows = itertools.zip_longest(expected_out.itertuples(index=False),
                                        true_out.itertuples(index=False))
    for expected_tuple, true_tuple in paired_rows:

        # Convert to dicts, keeping None for a missing row
        expected_row = None if expected_tuple is None else expected_tuple._asdict()
        true_row = None if true_tuple is None else true_tuple._asdict()

        # The rows agree if both exist and every value in the true row
        # matches the expected value
        rows_agree = (
            expected_row is not None
            and true_row is not None
            and all(_decoupled_values_match(key, true_val, expected_row[key])
                    for key, true_val in true_row.items())
        )

        if not rows_agree:
            print("FAILURE FOUND")
            return False, expected_out, true_out, expected_row, true_row

    # Every pair of rows agreed
    print("SUCCESS!!!!")
    return True, expected_out, true_out, expected_row, true_row


# This is a comparison helper that needs two dataframes, not a pytest test.
# Opt out of collection so a test runner that sees the name does not try to
# run it as one.
test_decoupled_aa.__test__ = False

In [4]:
def run_evseq_stress_test(detailed, include_nnn):
    """
    Repeatedly generate random test runs and check evSeq against them,
    stopping at the first run whose output does not match the expectation.

    detailed: Passed to `FakeRun`; whether to use a detailed refseq file.
    include_nnn: Passed to `FakeRun.build_refseq`.

    Returns the failing `FakeRun`, followed by everything returned by
    `test_decoupled_aa`, followed by the seed (loop counter) of the failing
    run. The function only returns on failure; the evSeq output of each
    passing run is deleted before the next run starts.
    """
    sort_columns = ["IndexPlate", "Well", "AaPosition", "Aa"]

    # Run until we break something
    counter = -1
    while True:

        counter += 1

        # Update the global RNG to match the counter (for reproducibility)
        test_glob.RANDOM_SEED = counter
        test_glob.NP_RNG = np.random.default_rng(counter)
        test_glob.RANDOM_RNG = random.Random(counter)

        # Build a test run and the associated input files
        test_run = FakeRun(detailed = detailed)
        test_run.build_fastq()
        test_run.build_refseq(include_nnn)

        # Run evSeq on the generated data
        test_run.run_evseq()

        # Get the expected outputs
        expected_out = test_run.build_expected_decoupled_aa()

        # Get the true outputs from the most recent evSeq output folder.
        # "AaPosition" is read as text: the expected table always stores it
        # as strings, and letting pandas infer an integer column (which
        # happens when no "#PARENT#" row is present) would sort the rows
        # numerically instead of in the string order used for the expected
        # table.
        most_recent_run_path = sorted(glob(os.path.join(SAVELOC, "evSeqOutput", "*")))[-1]
        true_out = pd.read_csv(os.path.join(most_recent_run_path, "OutputCounts",
                                            "AminoAcids_Decoupled_All.csv"),
                               dtype = {"AaPosition": str})
        true_out.sort_values(by = sort_columns, inplace = True)

        # Test the two dataframes to make sure they agree
        test_output = test_decoupled_aa(expected_out, true_out)

        # Stop at the first failure and hand back everything needed to debug
        if not test_output[0]:
            return test_run, *test_output, counter

        # If we pass, delete the evSeq output for this run
        shutil.rmtree(most_recent_run_path)

In [5]:
# Stress test with a non-detailed refseq file and without NNN positions.
# The call only returns once a generated run fails the comparison.
broken_run = run_evseq_stress_test(False, False)

SUCCESS!!!!
SUCCESS!!!!
SUCCESS!!!!
SUCCESS!!!!
SUCCESS!!!!
FAILURE FOUND


To do:
1. Write code that will allow continuation past a failure. Just record the well that is the problem and pull all alignments, counts, etc. for it.

In [6]:
broken_run[0].config.length_cutoff

0.20973973040727345

In [7]:
broken_run[0].config.average_q_cutoff

20

In [8]:
broken_run[-3]

{'IndexPlate': 'DI02',
 'Plate': 'TestPlate02',
 'Well': 'E04',
 'AaPosition': '1383',
 'Aa': 'A',
 'AlignmentFrequency': 0.3050847457627119,
 'WellSeqDepth': 59,
 'Flags': 'Unexpected Variation'}

In [9]:
broken_run[-2]

{'IndexPlate': 'DI02',
 'Plate': 'TestPlate02',
 'Well': 'E04',
 'AaPosition': '1383',
 'Aa': 'A',
 'AlignmentFrequency': 0.4186046511627907,
 'WellSeqDepth': 43,
 'Flags': 'Unexpected Variation'}

In [12]:
np.mean([ord(char) - 33 for char in  """H7;<8>;<E@B<H@CHH5HGA;97<@HH:88@>GH?G5BEIC8A?8CA5FAE585<D>88D:95<AE=9?=:C778@=H>:><E<HCG@C88H>F665;D9>EI>7CCF>D<D=?8=C<@65AI77E>7=<<C7I;I@GC@BG597;:I5G>=55>8@678E=58<6@E>E8<AFIE?86;=G<>E:8@B:7>@"""])

29.298969072164947

In [13]:
len("GTTGGAGCACCCAAGACCACTCTCCGGATACTGGCCGCTGCGGCCGTATAAAAGGGATAATTGACATAGGGAGGATCTTTGTGCATGTGTGTGACCGTTCGACACAAAATACGGCGCGCGCCCAGGGGCTATTATCTTGTAATTATGGATCCTAAATCTACGTTGGACTCGAACGCGAATACGACCCAGCCTAA")

194

In [16]:
broken_run[0].config.readlength * 0.9

183.6

In [8]:
broken_run[-3]

{'IndexPlate': 'DI01',
 'Plate': 'TestPlate01',
 'Well': 'C07',
 'AaPosition': '7961',
 'Aa': 'I',
 'AlignmentFrequency': 0.8181818181818182,
 'WellSeqDepth': 154,
 'Flags': 'Unexpected Variation'}

In [9]:
broken_run[-2]

{'IndexPlate': 'DI01',
 'Plate': 'TestPlate01',
 'Well': 'C07',
 'AaPosition': '7961',
 'Aa': 'I',
 'AlignmentFrequency': 0.8181818181818182,
 'WellSeqDepth': 77,
 'Flags': 'No usable forward alignments. -- Unexpected Variation'}

In [13]:
broken_run[0].wells[46].variants[0].f_quals

array([[33, 32, 30, ..., 25, 33, 35],
       [33, 32, 30, ..., 25, 33, 35],
       [33, 32, 30, ..., 25, 33, 35],
       ...,
       [33, 32, 30, ..., 25, 33, 35],
       [33, 32, 30, ..., 25, 33, 35],
       [33, 32, 30, ..., 25, 33, 35]])

In [14]:
broken_run[0].wells[44].variants[1].total_counts

19

In [None]:
broken_run