In [2]:
# Import deSeq globals
from ..code.globals import BARCODE_LENGTH, ADAPTER_LENGTH_F, ADAPTER_LENGTH_R

# Import required modules
import numpy as np
import pandas as pd
from string import Template
from random import shuffle

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

ImportError: attempted relative import with no known parent package

In [84]:
# Define the allowed bases (tuple form for random draws, set form for
# computing which bases remain available at a mutated position)
allowed_bases_tup = ("A", "T", "C", "G")
allowed_bases_set = set(allowed_bases_tup)

# Define potential "in frame" indices and allowed
# indices for making mutations within a codon.
# allowed_in_frame doubles as the candidate reading-frame offsets and the
# candidate positions inside a codon; allowed_n_mutations is the number of
# bases that may be changed within one mutated codon.
allowed_in_frame = (0, 1, 2)
allowed_n_mutations = (1, 2, 3)

# Define the frequency with which no mutations are made
# NOTE(review): not read by any code visible in this notebook
wt_freq = 0.05

# Define the allowed lengths of reference sequences. Used as the exclusive
# upper bound of np.random.randint, so the longest reference is 600 bases.
max_ref_length = 601

# Define the allowed number of reads in a well. Unpacked into
# np.random.randint as (low, high) with high exclusive: 0 to 100 reads.
allowed_n_reads = (0, 101)

# Determine how many variants are allowed in a well. Unpacked into
# np.random.randint as (low, high) with high exclusive: 1 to 3 variants.
allowed_n_variants = (1, 4)

# Determine the frequency with which we have dead wells
dead_well_freq = 0.05

# Define the probability that any given base is bad
# NOTE(review): only referenced from commented-out code in this notebook
bad_base_freq = 0.05

# Load the barcode df. The columns read elsewhere in this notebook are
# IndexPlate, Well, FBC and RBC. The path is relative to the working directory.
barcode_df = pd.read_csv("./Code/IndexMap.csv")

# Define a read ID template. $num is replaced by a running read counter, and
# the same ID is given to the forward and the reverse record of a read pair.
id_template = Template("@M06418:33:000000000-CRL6Y:1:1101:0:$num 1:N:0:162")

# Define the forward and reverse adapters
F_ADAPTER = "CACCCAAGACCACTCTCCGG"
R_ADAPTER = "CGGTGTGCGAAGTAGGTGC"

# Define the frequencies with which we ruin forward and reverse reads with
# different strategies (each is compared against a fresh np.random.rand() draw)
indel_freq = 0.01
low_qual_freq = 0.01

This notebook is used to build test cases for the new deSeq pipeline. The main things to test are:
1. The outputs of files are as we expect.
2. Everything works stably.

We will build general test conditions as well as a stress test that randomly generates input conditions and records instances of failure.

Conditions in particular to look at:
1. How well does deSeq handle: (0) imperfectly overlapping reads (e.g., reads that run into the barcode regions), (1) perfectly overlapping reads, (2) overlapping reads, (3) perfectly non-overlapping reads (where the forward read ends just before the reverse read starts), and (4) non-overlapping reads?


For a given well and set of deSeq parameters...
1. Randomly decide which positions are mutated
2. Randomly decide on a count and frequency for the full combination
3. Decide if the well is wild type (if the combination is wild type) or dead (if the count is below our expected frequency)
4. Build reads with quality scores, lengths, etc. that will generate the target output for the combination
    1. Consider all quality filters at this stage! The final product should be a combo of all of them so that we can test everything together.
    2. Make sure you mix up whether just the forward read is bad, just the reverse read is bad, etc.
5. Count the expected individual read counts and calculate a frequency 
6. Spike in random reads with insertions and deletions across the sequence

Repeat the above for a series of wells to make up a fake dataset. 

In [3]:
# Define run-specific parameters
# NOTE(review): average_q_cutoff and variable_count are not read by any code
# visible in this notebook; presumably they mirror deSeq settings - confirm.
average_q_cutoff = 25
# Lower bound of the quality-score range used for passing bases
bp_q_cutoff = 30
variable_count = 10
# Raw read length handed to WellContents
read_length = 150


In [4]:


def build_mutant_sequence():
    """Placeholder with no implementation yet; calling it does nothing and returns None."""
    pass

In [94]:
def mutate_codon(parent_codon, mut_inds, allowed_mutations):
    """Overwrite selected positions of a sequence with randomly drawn bases.

    Parameters
    ----------
    parent_codon : str
        Sequence to mutate (a codon, or a leading/trailing partial codon).
    mut_inds : sequence of int
        Positions within `parent_codon` to overwrite.
    allowed_mutations : sequence of set
        One set of candidate bases per entry of `mut_inds`, in the same order.

    Returns
    -------
    tuple
        The mutated sequence as a string, and a list holding, for each
        mutated position, its candidate set without the base just drawn.
    """
    bases = list(parent_codon)
    remaining = []

    for slot, position in enumerate(mut_inds):
        candidates = allowed_mutations[slot]

        # Draw the replacement base for this position and write it in
        drawn = np.random.choice(tuple(candidates))
        bases[position] = drawn

        # The drawn base is removed so a later call cannot pick it again
        remaining.append(candidates - set(drawn))

    return "".join(bases), remaining

# Write functions that break up the larger procedure below
def find_start_end_muts(nub_seq, nub_len):
    """Pick which positions of a partial-codon "nub" get mutated.

    Parameters
    ----------
    nub_seq : str
        The bases before the first complete codon or after the last one.
    nub_len : int
        Number of bases in `nub_seq`.

    Returns
    -------
    tuple
        A sorted array of the chosen positions (at least one is always
        chosen), and a list with, for each chosen position, the set of bases
        that differ from the base currently there.
    """
    positions = np.arange(nub_len)

    # Draw how many positions to mutate: anywhere from 1 to nub_len
    how_many = np.random.choice(positions + 1)

    # Draw that many distinct positions, then put them in ascending order
    chosen = np.random.choice(positions, size = how_many, replace = False)
    chosen = np.sort(chosen)

    # For every chosen position, any base other than the current one is allowed
    candidates = []
    for pos in chosen:
        candidates.append(allowed_bases_set - set(nub_seq[pos]))

    return chosen, candidates

class WellContents():
    """Simulated contents of a single well: a reference sequence plus mutated variants.

    On construction the object picks (or accepts) a reference sequence, chooses
    a reading-frame offset, splits the in-frame region into codons, builds a
    random number of mutated variants, and works out which basepairs and codons
    a forward/reverse read pair can cover.
    """

    # Initialize
    def __init__(self, ref_seq = None, ref_length = None, read_length = 150):
        """Build the reference, its codon bookkeeping, and the mutated variants.

        Parameters
        ----------
        ref_seq : str, optional
            Reference sequence. Drawn at random from `allowed_bases_tup` when omitted.
        ref_length : int, optional
            Reference length. When omitted it is taken from `ref_seq` if one
            was given, otherwise drawn from [read_length - 26, max_ref_length).
        read_length : int
            Raw read length; the adapter and barcode lengths are subtracted
            from it to get the usable read length.

        Raises
        ------
        ValueError
            If `ref_seq` and `ref_length` are both given and disagree.
        """
        # A supplied sequence fixes the length. Pairing it with a randomly
        # drawn length would leave the count arrays out of step with the bases.
        if ref_seq is not None:
            if ref_length is None:
                ref_length = len(ref_seq)
            elif ref_length != len(ref_seq):
                raise ValueError("ref_length (%d) does not match len(ref_seq) (%d)"
                                 % (ref_length, len(ref_seq)))

        # Randomly generate a reference sequence
        if ref_length is None:
            self.ref_length = np.random.randint(read_length - 26, max_ref_length)
        else:
            self.ref_length = ref_length
        if ref_seq is None:
            self.ref_seq = "".join(np.random.choice(allowed_bases_tup, size = self.ref_length))
        else:
            self.ref_seq = ref_seq

        # Assign what is in frame and split into codons. Only complete codons
        # are kept; a trailing partial chunk becomes the end nub below.
        self.in_frame = np.random.choice(allowed_in_frame)
        codon_starts = [i for i in range(self.in_frame, self.ref_length, 3)
                        if i + 3 <= self.ref_length]
        self.codon_bp_inds = [np.arange(i, i + 3) for i in codon_starts]
        self.all_codons = [self.ref_seq[i:i + 3] for i in codon_starts]
        self.n_codons = len(self.all_codons)
        self.codon_inds = np.arange(self.n_codons)

        # Get the amino acid sequences
        joined_readable = "".join(self.all_codons)
        self.aa_ref_seq = str(Seq(joined_readable).translate())

        # Map basepair to codon
        readable_bases = len(joined_readable)
        self.bp_to_codon = {ind: (ind - self.in_frame)//3
                            for ind in range(self.in_frame, self.in_frame + readable_bases)}

        # Map codon to basepair
        self.codon_to_bp = {codon_ind: np.array([start, start + 1, start + 2])
                            for codon_ind, start in enumerate(codon_starts)}

        # Find the starting and ending sequence nubs (the bases outside the
        # complete codons). Slicing is safe even when there are no codons.
        self.start_seq = self.ref_seq[:self.in_frame]
        self.start_len = len(self.start_seq)
        self.end_seq = self.ref_seq[self.in_frame + 3 * self.n_codons:]
        self.end_len = len(self.end_seq)

        # Build variants and make the mutant sequences
        self.build_variants()
        self.build_mutant_seqs()

        # Get the count array for a perfect sequence
        self.read_length = read_length
        self.get_readable_regions()
        self.get_perfect_count_array()

        # Find the variable positions
        self.find_variable_positions()

    def build_variants(self):
        """Choose the mutated positions and draw the bases for every variant.

        All variants of a well are mutated at the same positions. The set of
        bases still available at a position shrinks after each variant, so no
        two variants share a base at a mutated position.

        Raises
        ------
        ValueError
            If the reference holds no complete codon.
        """
        if self.n_codons < 1:
            raise ValueError("The reference sequence holds no complete codon to mutate")

        # Determine how many variants you want in a well
        self.n_variants = np.random.randint(*allowed_n_variants)

        # Choose the number of mutated positions, the number of mutations
        # in each codon, and where in the codon the mutation takes place.
        # The upper bound is floored at 2 so references with fewer than 20
        # codons still get one mutated codon instead of an empty randint range.
        max_pos_mutated = max(2, int(self.n_codons * 0.1))
        n_pos_mutated = np.random.randint(1, max_pos_mutated) # Total number of mutations
        self.codon_mutation_positions = np.sort(np.random.choice(self.codon_inds, size = n_pos_mutated, replace = False)) # Codons to mutate
        all_n_mutations = np.random.choice(allowed_n_mutations, size = n_pos_mutated) # Number of mutations at each codon
        self.all_codon_mut_pos = [np.sort(np.random.choice(allowed_in_frame, size = n_mutations, replace = False))
                                  for n_mutations in all_n_mutations]

        # Create a list of allowed mutations at each position
        allowed_mutations = [None] * n_pos_mutated
        for i, mutation_pos in enumerate(self.codon_mutation_positions):
            allowed_mutations[i] = [allowed_bases_set - set(self.all_codons[mutation_pos][mut_base_ind])
                                   for mut_base_ind in self.all_codon_mut_pos[i]]

        # Check to see if we want to mutate start positions. The position
        # attributes always exist; they stay empty when there is no nub.
        self.start_mut_pos = np.array([], dtype = int)
        if self.start_len > 0:
            self.start_mut_pos, allowed_start_muts = find_start_end_muts(self.start_seq, self.start_len)

        # Check to see if we want to mutate end positions
        self.end_mut_pos = np.array([], dtype = int)
        if self.end_len > 0:
            self.end_mut_pos, allowed_end_muts = find_start_end_muts(self.end_seq, self.end_len)

        # Loop over the number of variants and generate new sequences
        self.new_codons = [""] * self.n_variants
        self.new_starts = [""] * self.n_variants
        self.new_ends = [""] * self.n_variants
        for variant_ind in range(self.n_variants):

            # Mutate all codons
            new_codon_list = self.all_codons.copy()
            for i, (mut_pos, codon_mut_pos, codon_allowed_mutations) in enumerate(zip(self.codon_mutation_positions,
                                                                                      self.all_codon_mut_pos,
                                                                                      allowed_mutations)):

                # Mutate the codon
                new_codon, new_allowed = mutate_codon(new_codon_list[mut_pos], codon_mut_pos, codon_allowed_mutations)

                # Update the new sequence and new allowed
                new_codon_list[mut_pos] = new_codon
                allowed_mutations[i] = new_allowed

            # Update the list of new sequences
            self.new_codons[variant_ind] = new_codon_list

            # Check to see if we want to mutate the starting sequence. We will only
            # mutate if there is one
            if self.start_len > 0:
                self.new_starts[variant_ind], allowed_start_muts = mutate_codon(self.start_seq,
                                                                                self.start_mut_pos,
                                                                                allowed_start_muts)

            # Check to see if we want to mutate the end sequence. We will only
            # mutate if there is one
            if self.end_len > 0:
                self.new_ends[variant_ind], allowed_end_muts = mutate_codon(self.end_seq,
                                                                            self.end_mut_pos,
                                                                            allowed_end_muts)

    # Build sequences from the mutants
    def build_mutant_seqs(self):
        """Join each variant's start nub, codons and end nub, and translate the codons."""

        # Define lists for holding the new sequences
        self.mutant_aas = [None] * self.n_variants
        self.mutant_seqs = [None] * self.n_variants

        # Loop over all mutants and append to new lists
        for i, (start, mid, end) in enumerate(zip(self.new_starts, self.new_codons, self.new_ends)):

            # Get the new translatable sequence
            new_translatable = Seq("".join(mid))

            # Complete the mutated sequence
            self.mutant_seqs[i] = "".join([start, str(new_translatable), end])

            # Complete the translated sequence
            self.mutant_aas[i] = str(new_translatable.translate())

    def _reverse_read_start(self):
        """Return the first reference index covered by the reverse read.

        Clamped at 0: when the reverse read is longer than the reference, a
        negative start would wrap around and cover only the tail.
        """
        return max(0, self.ref_length - self.reverse_readlength)

    def get_readable_regions(self):
        """Record which basepairs and codons the forward and reverse reads cover."""

        # Get the read lengths of the forward and reverse reads
        self.forward_readlength = self.read_length - ADAPTER_LENGTH_F - BARCODE_LENGTH
        self.reverse_readlength = self.read_length - ADAPTER_LENGTH_R - BARCODE_LENGTH

        # Get the basepair readable positions
        all_bp_positions = np.arange(len(self.ref_seq))
        self.forward_readable_bps = set(all_bp_positions[:self.forward_readlength])
        self.reverse_readable_bps = set(all_bp_positions[self._reverse_read_start():])
        self.readable_bp_positions = self.forward_readable_bps.union(self.reverse_readable_bps)

        # Get the amino acid readable positions (every base of the codon must be covered)
        self.readable_aa_positions = set(pos for pos, bps in self.codon_to_bp.items()
                                         if all(bp in self.readable_bp_positions for bp in bps))

    def get_perfect_count_array(self):
        """Compute per-position coverage (0, 1 or 2) for one flawless read pair.

        Sets `perfect_counts` (one entry per basepair) and `perfect_counts_aa`
        (one entry per amino acid: the lowest coverage among its three bases).
        """

        # Get the perfect counts for basepairs
        self.perfect_counts = np.zeros(self.ref_length, dtype = int)
        self.perfect_counts[:self.forward_readlength] += 1
        self.perfect_counts[self._reverse_read_start():] += 1

        # Get the perfect counts for amino acids
        n_aas = len(self.aa_ref_seq)
        self.perfect_counts_aa = np.zeros(n_aas, dtype = int)

        # Loop over the counts array and increment appropriately
        for i in range(n_aas):
            self.perfect_counts_aa[i] += np.min(self.perfect_counts[self.codon_to_bp[i]])

    # Identify readable variable positions
    def find_variable_positions(self):
        """Locate, per variant, the mutated basepairs and codons that the reads cover."""

        # Get the readable variable basepair positions
        self.readable_variable_bp = np.array([
            [i for i, (ref_base, mut_base) in enumerate(zip(self.ref_seq, mutant_seq))
             if (ref_base != mut_base and i in self.readable_bp_positions)]
            for mutant_seq in self.mutant_seqs
        ])

        # Find the codons whose amino acid differs from the reference
        mutated_codons_by_seq = [[i for i, (ref_aa, mut_aa) in enumerate(zip(self.aa_ref_seq, mutant_aa))
                                  if (ref_aa != mut_aa)] for mutant_aa in self.mutant_aas]

        # Keep only those codons that are fully covered by the reads
        self.readable_codons = [[codon for codon in seq if
                                all(base_ind in self.readable_bp_positions for base_ind in self.codon_to_bp[codon])]
                               for seq in mutated_codons_by_seq]

    # Add noise to the completed sequences. Build the outputs (but don't save) as we go along.
    def build_output(self, bp_q_cutoff, n_reads = None):
        """Work-in-progress scaffold for generating noisy reads for this well.

        At present this only fixes the number of reads in the well
        (`n_seqs_in_well`) and, for every read, draws the four "ruin" flags.
        No reads are produced yet and nothing is returned.

        Parameters
        ----------
        bp_q_cutoff : int
            Per-base quality cutoff separating the high and low score ranges.
        n_reads : int, optional
            Number of reads in the well. Drawn from `allowed_n_reads` when omitted.
        """

        # Define the allowed range of quality scores
        high_q_range = np.arange(bp_q_cutoff, 40)
        low_q_range = np.arange(1, bp_q_cutoff)

        # Decide on the number of sequences in the well
        if n_reads is None:
            self.n_seqs_in_well = np.random.randint(*allowed_n_reads)
        else:
            self.n_seqs_in_well = n_reads

        # Get the number of times each sequence will be represented. Any
        # remainder from the even split goes to the first variant.
        n_reps = self.n_seqs_in_well // self.n_variants
        remainder = self.n_seqs_in_well % self.n_variants
        total_counts_by_seq = np.full(self.n_variants, n_reps)
        total_counts_by_seq[0] += remainder

        # Loop over each sequence
        for n_seqs, bp_seq, aa_seq in zip(total_counts_by_seq, self.mutant_seqs, self.mutant_aas):

            # Loop over the number of sequences
            for _ in range(n_seqs):

                # Generate a count array for the sequence
                seq_count_array = self.perfect_counts.copy()
                aa_count_array = self.perfect_counts_aa.copy()

                # Make a list for storing the new forward and reverse sequences
                new_f = []
                new_r = []

                # Decide if we want to ruin the forward / reverse sequence with an
                # insertion or deletion
                ruin_f_indel = bool(np.random.rand() < indel_freq)
                ruin_r_indel = bool(np.random.rand() < indel_freq)

                # Decide if we want to ruin the forward / reverse sequence by
                # dropping its quality
                ruin_f_qual = bool(np.random.rand() < low_qual_freq)
                ruin_r_qual = bool(np.random.rand() < low_qual_freq)

                # TODO: If the sequence is ruined by either of the above, subtract 1 from the appropriate
                # region of the count array. Note that this read won't count toward combinations.

                # Loop over each basepair in the sequence
                for bp in bp_seq:

                    # TODO: If this bp is not in a readable region, assign random

                    # TODO: If the forward and reverse regions overlap, decide if we want a disagreement in
                    # base called. If there is a disagreement, subtract 1 from the appropriate position
                    # of the count array. The reverse read will be used if there is a tie in qualities. If both
                    # reads are less than the q-score threshold, subtract 2 from the region of the count
                    # array and note that this read won't count toward combinations.
                    # If only 1 read is less than the q-score threshold, subtract 1; the read should still
                    # count toward combinations.

                    # TODO: If we are in non-overlapping regions and the quality is low, subtract 1 from the
                    # appropriate count position and note that this read won't count toward combinations
                    pass

    # Attach barcodes and adapters to mutant sequences
    def complete_sequences(self, FBC, RBC):
        """Build the full forward and reverse read sequences for every variant.

        Parameters
        ----------
        FBC : str
            Forward barcode, placed in front of the forward adapter.
        RBC : str
            Reverse barcode, placed in front of the reverse adapter.

        The results are stored as `Seq` objects in `finished_seqs_f` and
        `finished_seqs_r`, one entry per variant.
        """

        # Build complete sequences that can be saved as inputs
        self.finished_seqs_f = []
        self.finished_seqs_r = []
        for mutant_seq in self.mutant_seqs:

            # Pull readable regions of the sequences. The reverse read is the
            # reverse complement of the tail of the variant.
            forward_seq_unfinished = mutant_seq[:self.forward_readlength]
            rev_seq_unfinished = Seq(mutant_seq[self._reverse_read_start():])
            rev_seq_unfinished = str(rev_seq_unfinished.reverse_complement())

            # Finish the sequences and store
            self.finished_seqs_f.append(Seq("".join((FBC, F_ADAPTER, forward_seq_unfinished))))
            self.finished_seqs_r.append(Seq("".join((RBC, R_ADAPTER, rev_seq_unfinished))))
        

In [95]:
test = WellContents()

In [100]:
class OutputData():
    """Holds the reads of one variant together with their expected tallies.

    The tallies start from the best case, in which every read passes: the
    combination counters equal the number of reads, and the per-position
    counts are the single-read counts multiplied by the number of reads.
    """

    # Initialize
    def __init__(self, n_seqs, perfect_counts, perfect_counts_aa):
        """Reserve one slot per read and set the best-case tallies.

        Parameters
        ----------
        n_seqs : int
            Number of reads this container will hold.
        perfect_counts : array-like
            Per-basepair counts contributed by one flawless read pair.
        perfect_counts_aa : array-like
            Per-amino-acid counts contributed by one flawless read pair.
        """
        # Forward and reverse reads get independent lists of empty slots
        self.f_seqs = [None for _ in range(n_seqs)]
        self.r_seqs = list(self.f_seqs)

        # Every read is assumed to pass until shown otherwise
        self.passing_bp_combos = self.passing_aa_combos = n_seqs

        # Scale the single-read counts up to the whole set of reads
        self.bp_counts_by_pos = perfect_counts * n_seqs
        self.aa_counts_by_pos = perfect_counts_aa * n_seqs

In [62]:
def decide_on_ruin():
    """Draw the four independent flags that decide how a read pair is spoiled.

    Returns
    -------
    tuple of bool
        (forward indel, reverse indel, forward low quality, reverse low quality).
        Each flag is True when a fresh uniform draw falls below the matching
        frequency (`indel_freq` or `low_qual_freq`).
    """
    # Thresholds listed in draw order: both indel flags, then both quality flags
    thresholds = (indel_freq, indel_freq, low_qual_freq, low_qual_freq)

    flags = []
    for threshold in thresholds:
        flags.append(bool(np.random.rand() < threshold))

    return tuple(flags)

def enact_ruin(seq, qualities, q_thresh):
    """Randomly spoil one read with an indel and/or uniformly low quality scores.

    Two independent decisions are drawn: an indel with probability
    `indel_freq` and a quality drop with probability `low_qual_freq`.
    Shortening the read is not implemented yet.

    Parameters
    ----------
    seq : str
        Bases of the read.
    qualities : sequence of int
        One quality score per base of `seq`.
    q_thresh : int
        Quality threshold; a quality-ruined read gets scores in [1, q_thresh).

    Returns
    -------
    tuple
        `(seq, qualities)`. When nothing is ruined the inputs are returned
        untouched; otherwise `seq` is a new string and `qualities` a new numpy
        array with one score per base.
    """
    # Create booleans to see if we will ruin a sequence by adding an
    # insertion or deletion, or by dropping its quality
    ruin_indel = np.random.rand() < indel_freq
    ruin_qual = np.random.rand() < low_qual_freq

    # If we aren't ruining anything, just return the sequence with its
    # quality scores
    if not (ruin_indel or ruin_qual):
        return seq, qualities

    # Work on copies so the caller's objects are never modified
    bases = list(seq)
    qualities = np.array(qualities)

    # The indel point is kept at least 2 bases from the start and 3 from the
    # end, which needs 6 or more bases; shorter reads are left without an indel.
    if ruin_indel and len(bases) >= 6:

        # Choose a point for insertion or deletion
        indel_point = np.random.randint(2, len(bases) - 3)

        # Route for insertion
        if np.random.choice([True, False]):

            # Choose new quality and base
            new_qual = np.random.randint(1, 40)
            new_base = np.random.choice(allowed_bases_tup)

            # Update the sequence and the qualities together
            bases.insert(indel_point, new_base)
            qualities = np.insert(qualities, indel_point, new_qual)

        # Route for deletion
        else:

            # Update the sequence and the qualities together
            del bases[indel_point]
            qualities = np.delete(qualities, indel_point)

    # If we are ruining by quality, do this here. It runs after the indel so
    # the new scores match the final read length.
    if ruin_qual:

        # Take random qualities below the threshold; the upper bound is
        # floored at 2 so the range [1, high) is never empty
        qualities = np.random.randint(1, max(2, q_thresh), size = len(bases))

    return "".join(bases), qualities
        
    

# Draft of the per-well read generation, run against the `test` well.
# The per-read and per-base steps are still TODOs; the loops below only
# set up the containers.

# Define the allowed range of quality scores
high_q_range = np.arange(bp_q_cutoff, 40)
low_q_range = np.arange(1, bp_q_cutoff)

# Number of reads to place in the well; None means "draw it at random"
n_reads = None

# Decide on the number of sequences in the well
if n_reads is None:
    test.n_seqs_in_well = np.random.randint(*allowed_n_reads)
else:
    test.n_seqs_in_well = n_reads

# Get the number of times each sequence will be represented. Any remainder
# from the even split goes to the first variant.
n_reps = test.n_seqs_in_well // test.n_variants
remainder = test.n_seqs_in_well % test.n_variants
total_counts_by_seq = np.full(test.n_variants, n_reps)
total_counts_by_seq[0] += remainder

# Create a list for storing all results
all_results = []

# Loop over each sequence
for n_seqs, bp_seq, aa_seq in zip(total_counts_by_seq, test.mutant_seqs, test.mutant_aas):

    # Create a container for storing output data, starting from the
    # best-case counts of the well, and keep it with the other results
    output_container = OutputData(n_seqs, test.perfect_counts, test.perfect_counts_aa)
    all_results.append(output_container)

    # Loop over the number of sequences
    for _ in range(n_seqs):

        # TODO: If the sequence is ruined by either of the above, subtract 1 from the appropriate
        # region of the count array. Note that this read won't count toward combinations.

        # Loop over each basepair in the sequence
        for bp in bp_seq:

            # TODO: Randomly terminate early to have some sequences that are shorter than expected (but
            # that still will work)

            # TODO: If this bp is not in a readable region, assign random

            # TODO: If the forward and reverse regions overlap, decide if we want a disagreement in
            # base called. If there is a disagreement, subtract 1 from the appropriate position
            # of the count array. The reverse read will be used if there is a tie in qualities. If both
            # reads are less than the q-score threshold, subtract 2 from the region of the count
            # array and note that this read won't count toward combinations.
            # If only 1 read is less than the q-score threshold, subtract 1; the read should still
            # count toward combinations.

            # TODO: If we are in non-overlapping regions and the quality is low, subtract 1 from the
            # appropriate count position and note that this read won't count toward combinations
            pass

In [63]:
perfect_counts_aa

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1])

In [53]:
test.mutant_seqs

['TGCATGTATATTTCATTGCACAGAATAAGCCACCACACGTCCGTAACACACCCGCCCTTCGCCGAGGAGTTAGTTCGGAATCGAGTTGCCTTTTTGATTGCGCGCTGTCCTCTAAGCCCGCGGCAGTTATAGCTGCGACCGTACAGTACATCATATTGGCGTATACGAACGGCTGATGGATGGACTCCGATCTCCGTTCCATTTAGGTGTCTAAATAAATTGAAACATCCAATATCCGATCGATTTTGTACAAGTGGTGTAATACGCTGCGATCCTTCATGGTCTCCGTGCTGGCCACGGGCGCGTTTTCTATCTGAATACCTCGCGTGATCTAGGATTACTATGAACCACCGTCAGCGGATCGTGGTTGCCAGTTGATAGCTTGCCACTAATTCCTTGA']

In [None]:
# Decide if we want to eliminate the forward or reverse read in this count

# Start by calculating and testing outputs if no noise is added. Add noise later. Then you just adjust the ideal output based on the noise.

In [None]:
def build_output_records(average_q_cutoff = 25, bp_q_cutoff = 30,
                         variable_count = 10, read_length = 150,
                        ref_length = None):
    """Placeholder with no body yet; it accepts the run parameters and returns None.

    The statements after this definition do the actual work at module level,
    using module-level variables with the same names as these parameters.
    """
    pass

average_q_cutoff = 25
bp_q_cutoff = 30
variable_count = 10
read_length = 150
ref_length = None

# Fixed values written to the BpIndStart and AaIndStart columns of every row
bp_ind_start = 1232
aa_ind_start = 23

# Seed numpy's global generator so the whole dataset is reproducible
np.random.seed(2)

# Create a counter to index the input sequences. Create lists to
# store all outputs
output_counter = 0
all_out_f = []
all_out_r = []

# Build a list for storing the reference sequences
output_cols = ["PlateName", "IndexPlate", "Well", "ReferenceSequence",
              "InFrameBase", "BpIndStart", "AaIndStart"]
ref_seq_file_array = [None] * len(barcode_df)
wells = ref_seq_file_array.copy()

# Define the allowed range of quality scores. Every generated base passes
# the per-base cutoff; no noise is added at this stage.
high_q_range = np.arange(bp_q_cutoff, 40)

# Loop over the barcode plate
for barcode_ind, row in enumerate(barcode_df.itertuples(index = False)):

    # Create a well with specific content
    well = WellContents(ref_length = ref_length, read_length = read_length)
    wells[barcode_ind] = well

    # Populate the reference sequence row (in_frame is stored 1-indexed)
    ref_seq_file_array[barcode_ind] = [row.IndexPlate + "_TEST", row.IndexPlate, row.Well,
                                      well.ref_seq, well.in_frame + 1, bp_ind_start, aa_ind_start]

    # Decide on the number of sequences in the well
    n_seqs_in_well = np.random.randint(*allowed_n_reads)

    # Get the number of times each sequence will be represented. Any
    # remainder from the even split goes to the first variant.
    n_reps = n_seqs_in_well // well.n_variants
    remainder = n_seqs_in_well % well.n_variants
    total_counts_by_seq = np.full(well.n_variants, n_reps)
    total_counts_by_seq[0] += remainder

    # First reference index covered by the reverse read. Clamped at 0 so a
    # reference shorter than the read does not wrap around to its tail.
    reverse_start = max(0, well.ref_length - well.reverse_readlength)

    # Loop over the sequences and build both the expected output
    # and the desired input
    output_records_f = [None] * n_seqs_in_well
    output_records_r = output_records_f.copy()
    temp_counter = 0
    for total_counts, mutant_seq, mutant_aa_seq in zip(total_counts_by_seq, well.mutant_seqs, well.mutant_aas):

        # Every copy of a variant carries the same bases, so build them once.
        # The reverse read is the reverse complement of the variant's tail.
        forward_seq_unfinished = mutant_seq[:well.forward_readlength]
        rev_seq_unfinished = str(Seq(mutant_seq[reverse_start:]).reverse_complement())
        finished_f_bases = "".join((row.FBC, F_ADAPTER, forward_seq_unfinished))
        finished_r_bases = "".join((row.RBC, R_ADAPTER, rev_seq_unfinished))

        # Build outputs
        for _ in range(total_counts):

            # The FASTQ writer adds the leading "@" itself and builds the
            # header from id + description. Passing the first token as the id
            # and the whole header as the description yields exactly
            # "@<id> 1:N:0:162", not "@@<id> 1:N:0:162 <unknown description>".
            read_header = id_template.substitute(num = output_counter).lstrip("@")
            read_id = read_header.split(None, 1)[0]

            # Create sequence records (the pair shares one ID)
            output_records_f[temp_counter] = SeqRecord(Seq(finished_f_bases),
                                                       id = read_id,
                                                       description = read_header,
                                                       letter_annotations = {"phred_quality": np.random.choice(high_q_range, len(finished_f_bases))})
            output_records_r[temp_counter] = SeqRecord(Seq(finished_r_bases),
                                                       id = read_id,
                                                       description = read_header,
                                                       letter_annotations = {"phred_quality": np.random.choice(high_q_range, len(finished_r_bases))})

            # Update counters
            output_counter += 1
            temp_counter += 1

    # Extend the overall records
    all_out_f.extend(output_records_f)
    all_out_r.extend(output_records_r)

# Shuffle the order of the output records. One permutation is applied to both
# lists so record i of R1 still pairs with record i of R2, and it is drawn
# from the seeded numpy generator so the files are reproducible.
read_order = np.random.permutation(len(all_out_f))
all_out_f = [all_out_f[i] for i in read_order]
all_out_r = [all_out_r[i] for i in read_order]

# Save results
with open("deSeqValidation/Test_R1_001.fastq", "w") as f:
    SeqIO.write(all_out_f, f, "fastq")
with open("deSeqValidation/Test_R2_001.fastq", "w") as f:
    SeqIO.write(all_out_r, f, "fastq")
output_df = pd.DataFrame(ref_seq_file_array, columns = output_cols)
output_df.to_csv("./deSeqValidation/TestRefSeq.csv", index = False)

In [None]:
len(wells[6].codon_mutation_positions)

In [None]:
w

In [None]:
f_bc, r_bc = barcode_df.loc[(barcode_df.IndexPlate == "DI01")&(barcode_df.Well == "A01"), ["FBC", "RBC"]].values.flatten()
id_counter = 0


    
#     # Grab the mutated bases
#     mutant_bases = [[base_ind, orig_base, new_base] for 
#                     base_ind, (orig_base, new_base) in
#                     enumerate(zip(test.ref_seq, mutant_seq))
#                    if orig_base != new_base]
    
#     # Grab the mutated amino acids
#     mutant_aas = [[aa_ind, orig_aa, new_aa] for 
#                  aa_ind, (orig_aa, new_aa) in
#                  enumerate(zip(test.aa_ref_seq, mutant_aa_seq))
#                  if orig_aa != new_aa]
    
#     # Grab the expected counts for the bases
#     expected_base_counts = [test.perfect_counts[mutant_ind] for mutant_ind, _, _ in mutant_bases]
    
#     Get the expected counts


In [None]:
len(output_records_f[0].letter_annotations["phred_quality"])

In [None]:
output_records_f

In [None]:

# Loop over each variable sequence


# Create a counter for the number of sequences in a well



# Determine codons with poor quality bases that we mutated.
# bad_base_test = np.random.rand(len(test.codon_mutation_positions)) < bad_base_freq
# codons_with_bad_base = test.codon_mutation_positions[bad_base_test]


# Determine the bad bases for the variable each sequence.
# bad_base_checks = np.random.rand(n_seqs_in_well, test.ref_length) < bad_base_freq

# Decide if there are any bad bases in the sequences
# bad_base_checker = np.random.rand(n_seqs_in_well) < bad_base_in_seq_freq

# Loop over the bad base checker and decide (a) the number of bad
# positions and (b) where those positions are. 

# After sequences are split, decide if we want to tank just one of them (e.g.
# by adding insertions or deletions or dropping the q-score very low)


In [None]:
n_seqs_in_well

In [None]:
n_reps

In [None]:
total_counts_by_seq

In [None]:
codons_with_bad_base

In [None]:
# Get the overall number of reads that we want for the well
total_possible_reads = np.random.randint(*allowed_n_reads)

# Decide if the well is dead. Evaluate possible conditions to make it dead.
# The draw uses numpy's generator: the stdlib `random` module is never
# imported here, and numpy's generator is the one that gets seeded.
if np.random.rand() < dead_well_freq:

    # TODO: Choose "1", "2", or "3". "1" means make no usable reads (either by insertion, deletion
    # or low average read quality)
    pass

# Decide if we want this to be wild type 



In [None]:
# Make a kill reads function that either makes an insertion/deletion
# or else just wipes out functionality

In [None]:
# Now we need a function that...
# 1. Determines the overall count of each read
# 2. Assigns a target frequency for each position
# 3. Assigns a target count for each position
# 4. Breaks into forward and reverse reads
# 5. Loops over reads and assigns quality scores to the different bases
#

In [None]:
build_reference_and_variants()