In [1]:
# Import deSeq globals
from Code.Globals import BARCODE_LENGTH, ADAPTER_LENGTH_F, ADAPTER_LENGTH_R

# Import required modules
import numpy as np
import pandas as pd
from string import Template
from random import shuffle

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [2]:
# Define the allowed bases
allowed_bases_tup = ("A", "T", "C", "G")
allowed_bases_set = set(allowed_bases_tup)

# Define potential "in frame" indices and allowed
# indices for making mutations within a codon
allowed_in_frame = (0, 1, 2)
allowed_n_mutations = (1, 2, 3)

# Define the frequency with which no mutations are made
wt_freq = 0.05

# Define the allowed lengths of reference sequences
allowed_ref_lengths = (124, 601)

# Define the allowed number of reads in a well
allowed_n_reads = (0, 101)

# Determine how many variants are allowed in a well
allowed_n_variants = (1, 4)

# Determine the frequency with which we have dead wells
dead_well_freq = 0.05

# Define the probability that any given base is bad
bad_base_freq = 0.05

# Load the barcode df
barcode_df = pd.read_csv("./Code/IndexMap.csv")

# Define a forward ID template
id_template = Template("@M06418:33:000000000-CRL6Y:1:1101:0:$num 1:N:0:162")

# Define the forward and reverse adapters
F_ADAPTER = "CACCCAAGACCACTCTCCGG"
R_ADAPTER = "CGGTGTGCGAAGTAGGTGC"

This notebook is used to build test cases for the new deSeq pipeline. The main things to test are:
1. The outputs of files are as we expect.
2. Everything works stably.

We will build both general test conditions as well as make a stress test which randomly generates input conditions and records instances of failure.

Conditions in particular to look at:
1. How well does deSeq handle (0) imperfectly overlapping reads (e.g. reads go into the barcode regions) (1) perfectly overlapping reads (2) overlapping reads (3) perfectly non-overlapping (where the forward ends just before reverse starts) and (4) non-overlapping.


For a given well and set of deSeq parameters...
1. Randomly decide which positions are mutated
2. Randomly decide on a count and frequency for the full combination
3. Decide if the well is wild type (if the combination is wild type) or dead (if the count is below our expected frequency)
4. Build reads with quality scores, lengths, etc. that will generate the target output for the combination
    1. Consider all quality filters at this stage! The final product should be a combo of all of them so that we can test everything together.
    2. Make sure you mix up whether just the forward read is bad, just the reverse read is bad, etc.
5. Count the expected individual read counts and calculate a frequency 
6. Spike in random reads with insertions and deletions across the sequence

Repeat the above for a series of wells to make up a fake dataset. 

In [3]:
# Define run-specific parameters
average_q_cutoff = 25
bp_q_cutoff = 30
variable_count = 10
read_length = 150


In [4]:


def build_mutant_sequence():
    pass

In [5]:
def mutate_codon(parent_codon, mut_inds, allowed_mutations):
    
    # Define the new codon
    new_codon = list(parent_codon)
    new_allowed = [None] * len(mut_inds)
    
    # Loop over mutated positions
    for i, ind in enumerate(mut_inds):

        # Choose the new base
        new_base = np.random.choice(tuple(allowed_mutations[i]))
        
        # Update the codon
        new_codon[ind] = new_base
        
        # Update the allowed mutations
        new_allowed[i] = allowed_mutations[i] - set(new_base)

    # Return the new codon joined as a string and the updated
    # set of allowed codons
    return "".join(new_codon), new_allowed

# Write functions that break up the larger procedure below
def find_start_end_muts(nub_seq, nub_len):
    
    # Get the number of mutations in the starting and ending sequence
    nub_inds = np.arange(nub_len)
    n_nub_muts = np.random.choice(nub_inds + 1)

    # Get the positions of mutations in the starting and ending sequence
    nub_mut_pos = np.sort(np.random.choice(nub_inds, size = n_nub_muts, replace = False))

    # Get the allowed starting mutations
    allowed_nub_muts = [allowed_bases_set - set(nub_seq[mut_pos]) for mut_pos in nub_mut_pos]
    
    # Return needed info
    return nub_mut_pos, allowed_nub_muts    

class WellContents():
    
    # Initialize
    def __init__(self, ref_length = None, read_length = 150):
        
        # Randomly generate a reference sequence
        if ref_length == None:
            self.ref_length = np.random.randint(*allowed_ref_lengths)
        else:
            self.ref_length = ref_length
        self.ref_seq = "".join(np.random.choice(allowed_bases_tup, size = self.ref_length))

        # Assign what is in frame and split into codons
        self.in_frame = np.random.choice(allowed_in_frame)
        chunk_inds = [np.arange(i, i+3) for i in range(self.in_frame, self.ref_length, 3)]
        all_chunks = [self.ref_seq[i:i+3] for i in range(self.in_frame, self.ref_length, 3)]
        self.codon_bp_inds = [chunk for chunk in chunk_inds if len(chunk) == 3]
        self.all_codons = [chunk for chunk in all_chunks if len(chunk) == 3]
        self.n_codons = len(self.all_codons)
        self.codon_inds = np.arange(self.n_codons)
            
        # Get the amino acid sequences
        joined_readable = "".join(self.all_codons)
        self.aa_ref_seq = str(Seq(joined_readable).translate())

        # Map basepair to codon
        readable_bases = len(joined_readable)
        self.bp_to_codon = {ind: (ind - self.in_frame)//3 for ind in range(self.in_frame, self.in_frame + readable_bases)}
        
        # Find the starting and ending sequence nubs
        self.start_seq = self.ref_seq[:self.in_frame]
        self.start_len = len(self.start_seq)
        self.end_seq = all_chunks[-1] if len(all_chunks[-1]) != 3 else ""
        self.end_len = len(self.end_seq)

        # Build variants and make the mutant sequences
        self.build_variants()
        self.build_mutant_seqs()
        
        # Get the count array for a perfect sequence
        self.read_length = read_length
        self.get_perfect_count_array()
                
    def build_variants(self):
        
        # Determine how many variants you want in a well
        self.n_variants = np.random.randint(*allowed_n_variants)

        # Choose the number of mutated positions, the number of mutations
        # in each codon, and where in the codon the mutation takes place
        n_pos_mutated = np.random.randint(1, self.n_codons) # Total number of mutations
        self.codon_mutation_positions = np.sort(np.random.choice(self.codon_inds, size = n_pos_mutated, replace = False)) # Codons to mutate
        all_n_mutations = np.random.choice(allowed_n_mutations, size = n_pos_mutated) # Number of mutations at each codon
        self.all_codon_mut_pos = [np.sort(np.random.choice(allowed_in_frame, size = n_mutations, replace = False))
                                  for n_mutations in all_n_mutations]

        # Create a list of allowed mutations at each position
        allowed_mutations = [None] * n_pos_mutated
        for i, mutation_pos in enumerate(self.codon_mutation_positions):
            allowed_mutations[i] = [allowed_bases_set - set(self.all_codons[mutation_pos][mut_base_ind])
                                   for mut_base_ind in self.all_codon_mut_pos[i]]

        # Check to see if we want to mutate start positions
        if self.start_len > 0:
             self.start_mut_pos, allowed_start_muts = find_start_end_muts(self.start_seq, self.start_len)

        # Check to see if we want to mutate end positions
        if self.end_len > 0:
            self.end_mut_pos, allowed_end_muts = find_start_end_muts(self.end_seq, self.end_len)

        # Loop over the number of variants and generate new sequences
        self.new_codons = [""] * self.n_variants
        self.new_starts = [""] * self.n_variants
        self.new_ends = [""] * self.n_variants
        for variant_ind in range(self.n_variants):

            # Mutate all codons
            new_codon_list = self.all_codons.copy()
            for i, (mut_pos, codon_mut_pos, codon_allowed_mutations) in enumerate(zip(self.codon_mutation_positions,
                                                                                      self.all_codon_mut_pos,
                                                                                      allowed_mutations)):

                # Mutate the codon
                new_codon, new_allowed = mutate_codon(new_codon_list[mut_pos], codon_mut_pos, codon_allowed_mutations)

                # Update the new sequence and new allowed
                new_codon_list[mut_pos] = new_codon
                allowed_mutations[i] = new_allowed

                # Update the list of new sequences
                self.new_codons[variant_ind] = new_codon_list

            # Check to see if we want to mutate the starting sequence. We will only
            # mutate if there is one
            if self.start_len > 0:
                self.new_starts[variant_ind], allowed_start_muts = mutate_codon(self.start_seq,
                                                                                self.start_mut_pos,
                                                                                allowed_start_muts)

            # Check to see if we want to mutate the end sequence. We will only
            # mutate if there is one
            if self.end_len > 0:
                self.new_ends[variant_ind], allowed_end_muts = mutate_codon(self.end_seq,
                                                                            self.end_mut_pos,
                                                                            allowed_end_muts)
    
    # Build sequences from the mutants
    def build_mutant_seqs(self):
        
        # Define lists for holding the new sequences
        self.mutant_aas = [None] * self.n_variants
        self.mutant_seqs = [None] * self.n_variants
        
        # Loop over all mutants and append to new lists
        for i, (start, mid, end) in enumerate(zip(self.new_starts, self.new_codons, self.new_ends)):

            # Get the new translatable sequence
            new_translatable = Seq("".join(mid))
            
            # Complete the mutated sequence
            self.mutant_seqs[i] = "".join([start, str(new_translatable), end])
            
            # Complete the translated sequence
            self.mutant_aas[i] = str(new_translatable.translate())
            
    def get_perfect_count_array(self):
        
        # Get the read lengths of the forward and reverse reads
        self.forward_readlength = self.read_length - ADAPTER_LENGTH_F - BARCODE_LENGTH
        self.reverse_readlength = self.read_length - ADAPTER_LENGTH_R - BARCODE_LENGTH

        # Get the regions where the reads overlap
        self.perfect_counts = np.zeros(self.ref_length, dtype = int)
        self.perfect_counts[:self.forward_readlength] += 1
        self.perfect_counts[(self.ref_length - self.reverse_readlength):] += 1

# Start by calculating and testing outputs if no noise is added. Add noise later. Then you just adjust the ideal output based on the noise.

In [6]:
def build_output_records(average_q_cutoff = 25, bp_q_cutoff = 30,
                         variable_count = 10, read_length = 150,
                        ref_length = None):
    pass

average_q_cutoff = 25
bp_q_cutoff = 30
variable_count = 10
read_length = 150
ref_length = None

np.random.seed(2)

# Create a counter to index the input sequences. Create lists to
# store all outputs
output_counter = 0
all_out_f = []
all_out_r = []

# Build a list for storing the reference sequences
output_cols = ["PlateName", "IndexPlate", "Well", "ReferenceSequence",
              "InFrameBase", "BpIndStart", "AaIndStart"]
ref_seq_file_array = [None] * len(barcode_df)
wells = ref_seq_file_array.copy()

# Loop over the barcode plate
for barcode_ind, row in enumerate(barcode_df.itertuples(index = False)):
        
    # Create a well with specific content
    well = WellContents(ref_length = ref_length, read_length = read_length)   
    wells[barcode_ind] = well
    
    # Populate the reference sequence row
    ref_seq_file_array[barcode_ind] = [row.IndexPlate + "_TEST", row.IndexPlate, row.Well,
                                      well.ref_seq, well.in_frame + 1, 1232, 23]
    
    # Define the allowed range of quality scores
    high_q_range = np.arange(bp_q_cutoff, 40)

    # Decide on the number of sequences in the well
    n_seqs_in_well = np.random.randint(*allowed_n_reads)

    # Get the number of times each sequence will be represented
    n_reps = n_seqs_in_well // well.n_variants
    remainder = n_seqs_in_well % well.n_variants
    total_counts_by_seq = np.full(well.n_variants, n_reps)
    total_counts_by_seq[0] += remainder

    # Loop over the sequences and build both the expected output
    # and the desired input
    output_records_f = [None] * n_seqs_in_well
    output_records_r = output_records_f.copy()
    temp_counter = 0
    for total_counts, mutant_seq, mutant_aa_seq in zip(total_counts_by_seq, well.mutant_seqs, well.mutant_aas):
        
        # Build outputs
        for _ in range(total_counts):

            # Pull the forward sequence
            forward_seq_unfinished = mutant_seq[:well.forward_readlength]
            rev_seq_unfinished = Seq(mutant_seq[(well.ref_length - well.reverse_readlength):])
            rev_seq_unfinished = str(rev_seq_unfinished.reverse_complement())

            # Finish the sequences
            finished_f = Seq("".join((row.FBC, F_ADAPTER, forward_seq_unfinished)))
            finished_r = Seq("".join((row.RBC, R_ADAPTER, rev_seq_unfinished)))

            # Create sequence records
            output_records_f[temp_counter] = SeqRecord(finished_f,
                                                       id = id_template.substitute(num = output_counter),
                                                       letter_annotations = {"phred_quality": np.random.choice(high_q_range, len(finished_f))})
            output_records_r[temp_counter] = SeqRecord(finished_r,
                                                       id = id_template.substitute(num = output_counter),
                                                       letter_annotations = {"phred_quality": np.random.choice(high_q_range, len(finished_r))})

            # Update counters
            output_counter += 1 
            temp_counter += 1
            
    # Extend the overall records
    all_out_f.extend(output_records_f)
    all_out_r.extend(output_records_r)            
    
# Save results after shuffling order of output records
shuffle(all_out_f)
shuffle(all_out_r)
with open("deSeqValidation/Test_R1_001.fastq", "w") as f:
    SeqIO.write(all_out_f, f, "fastq")
with open("deSeqValidation/Test_R2_001.fastq", "w") as f:
    SeqIO.write(all_out_r, f, "fastq")
output_df = pd.DataFrame(ref_seq_file_array, columns = output_cols)
output_df.to_csv("./deSeqValidation/TestRefSeq.csv", index = False)

In [22]:
len(wells[6].codon_mutation_positions)

32

In [None]:
w

In [8]:
f_bc, r_bc = barcode_df.loc[(barcode_df.IndexPlate == "DI01")&(barcode_df.Well == "A01"), ["FBC", "RBC"]].values.flatten()
id_counter = 0


    
#     # Grab the mutated bases
#     mutant_bases = [[base_ind, orig_base, new_base] for 
#                     base_ind, (orig_base, new_base) in
#                     enumerate(zip(test.ref_seq, mutant_seq))
#                    if orig_base != new_base]
    
#     # Grab the mutated amino acids
#     mutant_aas = [[aa_ind, orig_aa, new_aa] for 
#                  aa_ind, (orig_aa, new_aa) in
#                  enumerate(zip(test.aa_ref_seq, mutant_aa_seq))
#                  if orig_aa != new_aa]
    
#     # Grab the expected counts for the bases
#     expected_base_counts = [test.perfect_counts[mutant_ind] for mutant_ind, _, _ in mutant_bases]
    
#     Get the expected counts


In [9]:
len(output_records_f[0].letter_annotations["phred_quality"])

150

In [10]:
output_records_f

[SeqRecord(seq=Seq('CCTAATCCACCCAAGACCACTCTCCGGAAATCCCGTTTAGGCAGACGGAGACCA...TTC'), id='@M06418:33:000000000-CRL6Y:1:1101:0:38350 1:N:0:162', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('CCTAATCCACCCAAGACCACTCTCCGGAAATCCCGTTTAGGCAGACGGAGACCA...TTC'), id='@M06418:33:000000000-CRL6Y:1:1101:0:38351 1:N:0:162', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('CCTAATCCACCCAAGACCACTCTCCGGAAATCCCGTTTAGGCAGACGGAGACCA...TTC'), id='@M06418:33:000000000-CRL6Y:1:1101:0:38352 1:N:0:162', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('CCTAATCCACCCAAGACCACTCTCCGGAAATCCCGTTTAGGCAGACGGAGACCA...TTC'), id='@M06418:33:000000000-CRL6Y:1:1101:0:38353 1:N:0:162', name='<unknown name>', description='<unknown description>', dbxrefs=[]),
 SeqRecord(seq=Seq('CCTAATCCACCCAAGACCACTCTCCGGAAATCCCGTTTAGGCAGACGGAGACCA...TTC'), id='@M06418:33:000000000-CRL6Y:1:1101:0:38354 1:N:0:162'

In [11]:

# Loop over each variable sequence


# Create a counter for the number of sequences in a well



# Determine codons with poor quality bases that we mutated.
# bad_base_test = np.random.rand(len(test.codon_mutation_positions)) < bad_base_freq
# codons_with_bad_base = test.codon_mutation_positions[bad_base_test]


# Determine the bad bases for the variable each sequence.
# bad_base_checks = np.random.rand(n_seqs_in_well, test.ref_length) < bad_base_freq

# Decide if there are any bad bases in the sequences
# bad_base_checker = np.random.rand(n_seqs_in_well) < bad_base_in_seq_freq

# Loop over the bad base checker and decide (a) the number of bad
# positions and (b) where those positions are. 

# After sequences are split, decide if we want to tank just one of them (e.g.
# by adding insertions or deletions or dropping the q-score very low)


In [12]:
n_seqs_in_well

46

In [13]:
n_reps

23

In [14]:
total_counts_by_seq

array([23, 23])

In [15]:
codons_with_bad_base

NameError: name 'codons_with_bad_base' is not defined

In [None]:
# Get the overall number of reads that we want for the well
total_possible_reads = np.random.randint(*allowed_n_reads)

# Decide if the well is dead. Evaluate possible conditions to make it dead.
if random.rand() < dead_well_freq:
    
    # Choose "1", "2", or "3". "1" means make no usable reads (either by insertion, deletion
    # or low average read quality)

# Decide if we want this to be wild type 



In [None]:
# Make a kill reads function that either makes an insertion/deletion
# or else just wipes out functionality

In [None]:
# Now we need a function that...
# 1. Determines the overall count of each read
# 2. Assigns a target frequency for each position
# 3. Assigns a target count for each position
# 4. Breaks into forward and reverse reads
# 5. Loops over reads and assigns quality scores to the different bases
#

In [None]:
build_reference_and_variants()