This notebook serves as a development ground for the new alignment protocol. Note that the Clustal Omega version used here was downloaded on 10/26/2020 from "http://www.clustal.org/omega/"; the downloaded binary file, clustalo-1.2.4-Ubuntu-x86_64, was renamed to "clustalo".

In [1]:
# Load necessary modules
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import pairwise2
from tqdm import tqdm
import numpy as np
import scipy.stats as ss
import pandas as pd
import warnings
import subprocess
import os

In [9]:
# Number of leading characters of every read that SeqPair.calculate_read_stats
# records as the barcode
BARCODE_LENGTH = 7

# Number of characters trimmed (in addition to the barcode) from the start of
# forward and reverse reads, respectively, to obtain the adapterless sequence
ADAPTER_LENGTH_F = 20
ADAPTER_LENGTH_R = 19

# Wrap the BioPython alignment function so that we can quickly change
# the alignment parameters in a single place later
def deseq_align(reference, query):
    """Align `query` against `reference` and return the first alignment found.

    The call is made through `pairwise2.align.globalxs` with a gap-open
    penalty of -2, a gap-extension penalty of -1, and end gaps left
    unpenalized. Only a single alignment is requested.
    """
    alignments = pairwise2.align.globalxs(reference, query,
                                          open = -2, extend = -1,
                                          penalize_end_gaps = False,
                                          one_alignment_only = True)

    # Only one alignment was requested, so hand back that one
    return alignments[0]
    

# Define an object that holds a forward/reverse pair of BioPython SeqRecords
class SeqPair():
    """Forward and/or reverse read of one fragment plus their alignments.

    A SeqPair starts empty. Reads are attached with `assign_f`/`assign_r`,
    filtered with `qc_reads`, aligned to a reference with `align`, and the
    alignments are filtered with `qc_alignments`. Count matrices are produced
    by `analyze_alignment`.
    """

    def __init__(self):
        """Create an empty pair with no reads and no alignments recorded."""

        # Did sequence pass QC?
        self._use_f = False
        self._use_r = False

        # Did alignments pass QC?
        self._use_f_alignment = False
        self._use_r_alignment = False

        # Read-level information. These start as None so that the properties
        # can be queried on a pair that is missing one of its mates (e.g. the
        # barcode of a read that was never assigned) without raising
        self._f_barcode = None
        self._f_len = None
        self._f_average_q = None
        self._f_adapterless = None
        self._r_barcode = None
        self._r_len = None
        self._r_average_q = None
        self._sliced_r = None
        self._r_adapterless = None

        # Alignment-level information, filled in by align() and qc_alignments()
        self._f_alignment = None
        self._r_alignment = None
        self._first_dash = None
        self._last_dash = None

    def assign_f(self, f_record):
        """Attach a forward read and record its barcode, length and mean quality."""

        # Build summary stats
        self._f_barcode, self._f_len, self._f_average_q = self.calculate_read_stats(f_record)

        # Strip the barcode and adapter from the start of the read
        self._f_adapterless = f_record[(BARCODE_LENGTH + ADAPTER_LENGTH_F):]

        # Note that we have forward information
        self._use_f = True

    def assign_r(self, r_record):
        """Attach a reverse read and record its barcode, length and mean quality."""

        # Build summary stats
        self._r_barcode, self._r_len, self._r_average_q = self.calculate_read_stats(r_record)

        # Strip the barcode and adapter from the start of the read. We keep the
        # sliced read as-is and also store its reverse complement, which is the
        # orientation that matches the reference sequence
        self._sliced_r = r_record[(BARCODE_LENGTH + ADAPTER_LENGTH_R):]
        self._r_adapterless = self.sliced_r.reverse_complement(id = True, description = True)

        # Note that we have reverse information
        self._use_r = True

    def calculate_read_stats(self, record):
        """Return (barcode, read length, mean phred quality) for a read."""

        barcode = str(record.seq)[:BARCODE_LENGTH]
        length = len(record)
        average_quality = np.mean(record.letter_annotations["phred_quality"])

        return barcode, length, average_quality

    def qc_reads(self, length_filter, q_cutoff):
        """Stop using any read shorter than `length_filter` or with a mean
        quality below `q_cutoff`. Reads that were never assigned are skipped."""

        # Only run QC on the forward read if we actually recorded one
        if self.use_f and (self.f_len < length_filter or self.f_average_q < q_cutoff):
            self._use_f = False

        # Only run QC on the reverse read if we actually recorded one
        if self.use_r and (self.r_len < length_filter or self.r_average_q < q_cutoff):
            self._use_r = False

    def align(self, reference):
        """Align every usable read to `reference`.

        The alignment of a read that is not in use is stored as None.
        Raises AssertionError if neither read is usable.
        """
        if self.is_dud():
            raise AssertionError("No reads to align in reference.")

        # Only align the reads we are using
        self._f_alignment = (deseq_align(reference, self.f_adapterless.seq)
                             if self.use_f else None)
        self._r_alignment = (deseq_align(reference, self.r_adapterless.seq)
                             if self.use_r else None)

    def qc_alignment(self, forward_check = True):
        """Run QC on the forward (default) or reverse alignment.

        An alignment fails if the reference contains a gap, or if the aligned
        read contains a gap anywhere except after its end (forward read) or
        before its start (reverse read).

        Returns a tuple `(good_alignment, dash_index)`. For a forward check
        `dash_index` is the index of the first gap in the aligned read (-1 if
        there is none); for a reverse check it is the index of the last gap
        (-1 if there is none). A missing alignment returns `(False, -1)`.
        """
        # Pull the appropriate alignment
        test_alignment = self.f_alignment if forward_check else self.r_alignment

        # If the alignment is None the original sequence failed QC (or was
        # never assigned) and we cannot continue
        if test_alignment is None:
            return False, -1

        # Pull the reference sequence and aligned sequence
        refseq = test_alignment.seqA
        aligned_seq = test_alignment.seqB

        # Any dash in the reference sequence means an insertion in the read
        good_alignment = "-" not in refseq

        # Get the stripped down aligned sequences
        lstripped = aligned_seq.lstrip("-")
        rstripped = aligned_seq.rstrip("-")

        if forward_check:

            # Dashes to the left of or in the middle of a forward read
            # indicate a deletion or insertion
            if len(lstripped) < len(aligned_seq) or "-" in rstripped:
                good_alignment = False

            # The first dash marks the first position after the read ends
            return good_alignment, lstripped.find("-")

        # Dashes to the right of or in the middle of a reverse read indicate
        # a deletion or insertion
        if len(rstripped) < len(aligned_seq) or "-" in lstripped:
            good_alignment = False

        # The last dash marks the last position before the read begins
        return good_alignment, rstripped.rfind("-")

    def qc_alignments(self):
        """Run alignment QC on both reads and store the usability flags and
        the gap boundaries."""
        self._use_f_alignment, self._first_dash = self.qc_alignment(forward_check = True)
        self._use_r_alignment, self._last_dash = self.qc_alignment(forward_check = False)

    def build_paired_composite_alignment(self):
        """Merge the forward and reverse alignments into one sequence.

        Where the reads do not overlap the gap between them is filled with
        dashes of quality `np.inf`. Where they overlap, the base with the
        higher quality score is used (ties go to the forward read).

        Returns `(composite_seq, composite_qual)`, both as long as the
        aligned reference. Raises AssertionError if either alignment failed QC.
        """
        # Both forward and reverse reads must pass alignment qc to enable this
        assert self.is_paired_post_alignment_qc(), "Cannot build composite from 1 read."

        # Grab the reference sequence, the aligned sequences,
        # and the quality scores
        refseq = self.f_alignment.seqA
        reflength = len(refseq)
        forward_seq = self.f_alignment.seqB
        reverse_seq = self.r_alignment.seqB
        forward_qual = np.array(self.f_adapterless.letter_annotations["phred_quality"])
        reverse_qual = np.array(self.r_adapterless.letter_annotations["phred_quality"])

        # Get the end of the f read. If it goes all the way to the end of the
        # reference sequence, the first non-f position is the reference length
        post_forward_dash_ind = reflength if self.first_dash == -1 else self.first_dash

        # Get the beginning of the r read. If it starts at the beginning of the
        # reference sequence the last non-r position is -1, so no adjustment
        # is needed
        pre_reverse_dash_ind = self.last_dash
        first_r_char_ind = pre_reverse_dash_ind + 1

        # If the reads do not overlap, the composite is just the
        # forward DNA + dashes + reverse DNA
        if post_forward_dash_ind <= pre_reverse_dash_ind:

            # Calculate the number of dashes needed
            n_dashes = pre_reverse_dash_ind - post_forward_dash_ind + 1

            composite_seq = "".join((forward_seq[:post_forward_dash_ind],
                                     "-" * n_dashes,
                                     reverse_seq[first_r_char_ind:]))

            # The quality scores are not extended for the alignment, and so
            # map directly onto the pulled sequences
            composite_qual = np.concatenate((forward_qual,
                                             np.full(n_dashes, np.inf),
                                             reverse_qual))

        # Otherwise, take the base with the highest quality in the overlap
        else:

            # Pull the forward up to the start of the reverse sequence
            only_f_seq = forward_seq[:first_r_char_ind]
            only_f_qual = forward_qual[:first_r_char_ind]

            # Pull the reverse after the end point of forward. Quality scores
            # only cover sequence (not alignment gaps), so we need to calculate
            # where the overlap ends within the reverse qualities
            only_r_seq = reverse_seq[post_forward_dash_ind:]
            reverse_qual_break = len(reverse_qual) - len(only_r_seq)
            only_r_qual = reverse_qual[reverse_qual_break:]

            # Pull the overlapping region from both reads
            middle_f_seq = forward_seq[first_r_char_ind:post_forward_dash_ind]
            middle_f_qual = forward_qual[first_r_char_ind:]
            middle_r_seq = reverse_seq[first_r_char_ind:post_forward_dash_ind]
            middle_r_qual = reverse_qual[:reverse_qual_break]

            # The middle sequences should be equal in length (they might differ
            # in sequence due to sequencing errors). The quality scores should
            # have the same length as well
            middle_size = len(middle_f_seq)
            assert middle_size == len(middle_r_seq)
            assert middle_size == len(middle_f_qual)
            assert middle_size == len(middle_r_qual)

            # Use the reverse base only where it is strictly better
            use_reverse = middle_r_qual > middle_f_qual
            middle_seq = "".join(r_char if take_r else f_char
                                 for f_char, r_char, take_r
                                 in zip(middle_f_seq, middle_r_seq, use_reverse))
            middle_qual = np.where(use_reverse, middle_r_qual, middle_f_qual)

            # Build the overall composite sequence and qualities
            composite_seq = "".join((only_f_seq, middle_seq, only_r_seq))
            composite_qual = np.concatenate((only_f_qual, middle_qual, only_r_qual))

        # Check to be sure lengths are correct
        assert reflength == len(composite_seq)
        assert reflength == len(composite_qual)

        return composite_seq, composite_qual

    def build_unpaired_composite_alignment(self):
        """Build the composite for a pair with a single usable alignment.

        The composite sequence is the aligned read itself. Its qualities are
        padded with `np.inf` (after a forward read, before a reverse read) up
        to the length of the aligned reference, so the padded gap positions
        always clear the quality threshold used in `analyze_alignment`.
        """
        # First make sure that we are calling this function appropriately
        assert not self.is_paired_post_alignment_qc(), "This function only works for unpaired reads"

        if self.use_f_alignment:

            # The composite sequence is just the aligned sequence
            composite_length = len(self.f_alignment.seqA)
            composite_seq = self.f_alignment.seqB

            # The gaps come after the forward read, so pad at the end
            forward_qual = self.f_adapterless.letter_annotations["phred_quality"]
            composite_qual = np.concatenate((forward_qual,
                                             np.full(composite_length - len(forward_qual), np.inf)))

        else:

            # The composite sequence is just the aligned sequence
            composite_length = len(self.r_alignment.seqA)
            composite_seq = self.r_alignment.seqB

            # The gaps come before the reverse read, so pad at the start
            reverse_qual = self.r_adapterless.letter_annotations["phred_quality"]
            composite_qual = np.concatenate((np.full(composite_length - len(reverse_qual), np.inf),
                                             reverse_qual))

        # Assert that everything is the expected length
        assert composite_length == len(composite_seq)
        assert composite_length == len(composite_qual)

        return composite_seq, composite_qual

    def build_composite_alignment(self):
        """Build the composite sequence and qualities for this pair, using the
        paired routine when both alignments passed QC."""
        if self.is_paired_post_alignment_qc():
            return self.build_paired_composite_alignment()
        return self.build_unpaired_composite_alignment()

    def analyze_alignment(self, inframe_ind, ref_len, n_aas, qual_thresh):
        """Count bases and amino acids observed in the composite alignment.

        A base is counted only if its quality is at least `qual_thresh`. A
        codon is translated only if all three of its bases pass the
        threshold; a codon containing a gap is recorded as "-" and a codon
        missing from CODON_TABLE as "?". Codons are read from `inframe_ind`.

        Returns `(bp_counts, aa_counts)` with shapes `(6, ref_len)` and
        `(23, n_aas)`, indexed by BP_TO_IND and AA_TO_IND respectively.
        """
        # Pull the composite alignment for the sequence
        composite_sequence, composite_qual = self.build_composite_alignment()
        composite_sequence = str(composite_sequence)

        # Create matrices in which to store counts
        bp_counts = np.zeros([6, ref_len], dtype = int)
        aa_counts = np.zeros([23, n_aas], dtype = int)

        # Record every base that meets the quality threshold
        passes_qual = np.asarray(composite_qual) >= qual_thresh
        for base_ind, bp in enumerate(composite_sequence):
            if passes_qual[base_ind]:
                bp_counts[BP_TO_IND[bp], base_ind] += 1

        # Walk over the complete in-frame codons, including the final one
        for aa_ind in range(n_aas):

            codon_start = inframe_ind + 3 * aa_ind
            codon_stop = codon_start + 3

            # Stop if the composite does not contain a full codon here
            if codon_stop > len(composite_sequence):
                break

            # Skip the codon unless all of its members passed quality control
            if not passes_qual[codon_start:codon_stop].all():
                continue

            codon = composite_sequence[codon_start:codon_stop]

            # If this is in a gap, record gap. If it isn't in the codon
            # table, record a question mark
            if "-" in codon:
                aa = "-"
            else:
                aa = CODON_TABLE.get(codon, "?")

            aa_counts[AA_TO_IND[aa], aa_ind] += 1

        # Run a check on the count. A sum across the 0th axis should
        # return all ones and zeros, as we should never count two bases or two
        # amino acids in one position
        bp_test = np.sum(bp_counts, axis = 0)
        aa_test = np.sum(aa_counts, axis = 0)
        assert np.all(np.logical_or(bp_test == 1, bp_test == 0)), "Double counting bases"
        assert np.all(np.logical_or(aa_test == 1, aa_test == 0)), "Double counting amino acids"

        # Return the filled out count matrices
        return bp_counts, aa_counts

    def read_lengths(self):
        """Return `[forward length, reverse length]`, with np.nan for a read
        that is not in use. Raises AssertionError if neither read is in use."""
        if self.is_dud():
            raise AssertionError("No reads for which to return lengths.")
        return [self.f_len if self.use_f else np.nan,
                self.r_len if self.use_r else np.nan]

    def is_paired(self):
        """True if both the forward and the reverse read are in use."""
        return bool(self.use_r and self.use_f)

    def is_dud(self):
        """True if neither read is in use."""
        return not (self.use_r or self.use_f)

    def is_paired_post_alignment_qc(self):
        """True if both alignments passed alignment QC."""
        return bool(self.use_f_alignment and self.use_r_alignment)

    def is_dud_post_alignment_qc(self):
        """True if neither alignment passed alignment QC."""
        return not (self.use_f_alignment or self.use_r_alignment)

    # Make all the properties
    @property
    def use_f(self):
        return self._use_f

    @property
    def use_r(self):
        return self._use_r

    @property
    def use_f_alignment(self):
        return self._use_f_alignment

    @property
    def use_r_alignment(self):
        return self._use_r_alignment

    @property
    def f_barcode(self):
        return self._f_barcode

    @property
    def f_len(self):
        return self._f_len

    @property
    def f_average_q(self):
        return self._f_average_q

    @property
    def f_adapterless(self):
        return self._f_adapterless

    @property
    def r_barcode(self):
        return self._r_barcode

    @property
    def r_len(self):
        return self._r_len

    @property
    def r_average_q(self):
        return self._r_average_q

    @property
    def sliced_r(self):
        return self._sliced_r

    @property
    def r_adapterless(self):
        return self._r_adapterless

    @property
    def f_alignment(self):
        return self._f_alignment

    @property
    def r_alignment(self):
        return self._r_alignment

    @property
    def first_dash(self):
        return self._first_dash

    @property
    def last_dash(self):
        return self._last_dash

In [10]:
# Write a function for loading and pairing fastq files
def load_fastq(f_loc, r_loc):
    """Load forward and reverse fastq files and pair the reads by record id.

    Every forward record gets its own SeqPair. A reverse record is attached
    to the SeqPair with the same id if there is one; otherwise it gets a new
    SeqPair of its own.

    Returns a list of all SeqPair objects, including those that hold only a
    forward or only a reverse read.
    """
    # Create a dictionary that links id to sequence object
    id_to_reads = {}

    # Stream the forward records rather than holding the parsed file in memory
    print("Loading forward reads...")
    for f_record in SeqIO.parse(f_loc, "fastq"):
        seqpair = SeqPair()
        seqpair.assign_f(f_record)
        id_to_reads[f_record.id] = seqpair

    # Associate reverse reads with the forward
    print("Loading reverse reads...")
    for r_record in SeqIO.parse(r_loc, "fastq"):

        # If there is no partner in id_to_reads, create a new object
        seqpair = id_to_reads.get(r_record.id)
        if seqpair is None:
            seqpair = SeqPair()
            id_to_reads[r_record.id] = seqpair

        seqpair.assign_r(r_record)

    # Return every record, whether or not it has a partner
    return list(id_to_reads.values())

# Load the forward (R1) and reverse (R2) fastq files of the test dataset and
# pair the reads by record id
all_seqpairs = load_fastq("./TestData/20200205_ssSeq/CHL2_S199_L001_R1_001.fastq",
                          "./TestData/20200205_ssSeq/CHL2_S199_L001_R2_001.fastq")

Loading forward reads...
Loading reverse reads...


In [11]:
# Write a function for filtering out bad seqpairs
def qc_seqpairs(all_seqpairs, read_length = None, length_cutoff = 0.9, q_cutoff = 30):
    """Run read-level QC on every seqpair and drop those with no usable read.

    Parameters
    ----------
    all_seqpairs : sequence of SeqPair
        Pairs to filter. Their usability flags are updated in place.
    read_length : number, optional
        Expected read length. If None, the most common read length found in
        `all_seqpairs` is used (the smallest one if there is a tie).
    length_cutoff : float
        Fraction of `read_length` a read must reach to be kept.
    q_cutoff : number
        Minimum mean quality a read must reach to be kept.

    Returns
    -------
    tuple
        The seqpairs that still have at least one usable read.
    """
    print("Running read qc...")

    # Nothing to filter, and no read length could be determined anyway
    if len(all_seqpairs) == 0:
        return ()

    # If we don't have the read length determine it
    if read_length is None:

        # Get the most common read length, ignoring the np.nan placeholders
        # that stand in for missing reads. np.unique returns sorted values, so
        # argmax picks the smallest length among ties
        all_readlengths = np.array([seqpair.read_lengths() for seqpair in all_seqpairs])
        observed_lengths = all_readlengths[~np.isnan(all_readlengths)]
        unique_lengths, length_counts = np.unique(observed_lengths, return_counts = True)
        read_length = unique_lengths[np.argmax(length_counts)]

    # Calculate the read filter
    read_filter = read_length * length_cutoff

    # Run QC on every read
    for seqpair in all_seqpairs:
        seqpair.qc_reads(read_filter, q_cutoff)

    # Eliminate any duds, which are those seqpairs with both a forward and a
    # reverse that failed qc
    return tuple(seqpair for seqpair in all_seqpairs if not seqpair.is_dud())

# Run read QC with the default cutoffs; the read length is inferred from the data
filtered_seqpairs = qc_seqpairs(all_seqpairs)

Running read qc...


In [12]:
# Every well name from A01 through H12: rows A-H, columns 01-12
ALLOWED_WELLS = {f"{plate_row}{plate_col:02d}"
                 for plate_row in "ABCDEFGH"
                 for plate_col in range(1, 13)}

# Row index of each base character in the base-pair count matrices
BP_TO_IND = {character: ind for ind, character in enumerate("ATCGN-")}

# Row index of each amino acid character in the amino acid count matrices.
# "*" is a stop codon, "?" an untranslatable codon and "-" a gap
AA_TO_IND = {character: ind for ind, character in enumerate("ARNDCEQGHILKMFPSTWYV*?-")}

# Codon to single-letter amino acid; stop codons map to "*"
CODON_TABLE = {
    "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
    "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
    "TAT": "Y", "TAC": "Y",
    "TGT": "C", "TGC": "C", "TGG": "W",
    "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
    "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
    "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
    "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
    "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
    "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
    "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
    "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
    "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
    "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
    "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
    "TAG": "*", "TAA": "*", "TGA": "*",
}

# Read the index map and the reference sequence file from disk
index_map = pd.read_csv("/home/brucejwittmann/GitRepos/ssSeq/ssSeqSupport/IndexMap.csv")
ref_seq_crude = pd.read_csv("/home/brucejwittmann/GitRepos/ssSeq/AlignmentDev/TestData/20200205_ssSeq/RefSeqs.csv")

# Repeat every reference sequence row once for each allowed well
updated_ref_array = [
    [row.PlateName, row.IndexPlate, well, row.ReferenceSequence, row.InFrameBase]
    for row in ref_seq_crude.itertuples(index = False)
    for well in ALLOWED_WELLS
]

# Collect the expanded rows into the complete reference sequence dataframe
complete_ref_seq = pd.DataFrame(updated_ref_array,
                                columns = ("PlateName", "IndexPlate", "Well",
                                           "ReferenceSequence", "InFrameBase"))

# Attach the index map by joining on plate and well
merged_dfs = complete_ref_seq.merge(index_map, on = ("IndexPlate", "Well"))

# Look up plate, well, and reference sequence information from the
# (forward barcode, reverse barcode) pair
bc_to_ref_plate_well = {}
for row in merged_dfs.itertuples(index = False):
    bc_to_ref_plate_well[(row.FBC, row.RBC)] = {
        "IndexPlate": row.IndexPlate,
        "PlateNickname": row.PlateName,
        "Well": row.Well,
        "ReferenceSequence": row.ReferenceSequence,
        "InFrameBase": row.InFrameBase,
    }



In [16]:
class Well():
    """All sequence pairs assigned to one well, with its reference sequence.

    The well aligns its seqpairs against the reference (`align`), builds
    per-read base and amino acid count matrices (`analyze_alignments`) and
    can compute the counts expected from the reference alone
    (`calculate_expected_arrays`).
    """

    def __init__(self, seqpairs, refseq_df_info, save_dir):
        """Store the seqpairs and unpack the reference information.

        `refseq_df_info` is a mapping with the keys "IndexPlate",
        "PlateNickname", "Well", "ReferenceSequence" and "InFrameBase"
        (1-indexed position of the first in-frame base). Output paths are
        built underneath `save_dir`.
        """
        # Assign the sequence pairs as an attribute and unpack the refseq info
        self._all_seqpairs = seqpairs
        self._index_plate = refseq_df_info["IndexPlate"]
        self._plate_nickname = refseq_df_info["PlateNickname"]
        self._well = refseq_df_info["Well"]
        self._reference_sequence = refseq_df_info["ReferenceSequence"]
        self._ref_len = len(self.reference_sequence)
        self._in_frame_ind = refseq_df_info["InFrameBase"] - 1 #Input is 1-indexed, so subtract 1

        # Generate save locations for alignment files
        self._fasta_loc = os.path.join(save_dir, "ParsedFilteredFastqs")
        self._alignment_loc = os.path.join(save_dir, "Alignments",
                                           f"{self.index_plate}-{self.well}.txt")

        # Get the number of complete codons in the reference sequence
        self._n_aas = (self.ref_len - self.in_frame_ind) // 3

    def write_fastqs(self):
        """Write adapterless fastq files for all seqpairs with two good alignments.

        Forward reads go to `<fasta_loc>/F/<plate>-<well>_R1.fastq` and
        reverse reads to `<fasta_loc>/R/<plate>-<well>_R2.fastq`. The reverse
        reads are written as sliced (not reverse complemented). Output
        directories are created if they do not exist.
        """
        # Identify the paired end sequence pairs
        paired_end_alignments = tuple(seqpair for seqpair in self.all_seqpairs
                                      if seqpair.is_paired_post_alignment_qc())

        # Build a list of sequences to save
        f_records_to_save = [seqpair.f_adapterless for seqpair in paired_end_alignments]
        r_records_to_save = [seqpair.sliced_r for seqpair in paired_end_alignments]
        assert len(f_records_to_save) == len(r_records_to_save), "Mismatch in number of paired ends"

        # Save the forward records to the R1 file and the reverse to the R2 file
        outputs = (("F", "R1", f_records_to_save),
                   ("R", "R2", r_records_to_save))
        for subdir, read_tag, records in outputs:
            out_dir = os.path.join(self.fasta_loc, subdir)
            os.makedirs(out_dir, exist_ok = True)
            out_path = os.path.join(out_dir, f"{self.index_plate}-{self.well}_{read_tag}.fastq")
            with open(out_path, "w") as f:
                SeqIO.write(records, f, "fastq")

    def align(self):
        """Align every seqpair to the reference, run alignment QC, and record
        the seqpairs that have at least one alignment passing QC."""

        # Run alignment on all seqpairs
        for seqpair in self.all_seqpairs:
            seqpair.align(self.reference_sequence)
            seqpair.qc_alignments()

        # Identify seqpairs that have at least one read passing alignment QC
        self._non_dud_alignments = tuple(seqpair for seqpair in self.all_seqpairs
                                         if not seqpair.is_dud_post_alignment_qc())

    def analyze_alignments(self, qual_thresh):
        """Build count matrices for every non-dud seqpair.

        Fills `bp_counts` with shape `(n_reads, 6, ref_len)` and `aa_counts`
        with shape `(n_reads, 23, n_aas)`. `align` must be called first.
        """
        # Create matrices in which to store counts
        n_non_duds = len(self.non_dud_alignments)
        self._bp_counts = np.zeros([n_non_duds, 6, self.ref_len], dtype = int)
        self._aa_counts = np.zeros([n_non_duds, 23, self.n_aas], dtype = int)

        # Loop over all non-dud seqpairs and record counts for each aa and base
        for pair_ind, seqpair in enumerate(self.non_dud_alignments):
            pair_bp_counts, pair_aa_counts = seqpair.analyze_alignment(self.in_frame_ind,
                                                                       self.ref_len,
                                                                       self.n_aas,
                                                                       qual_thresh)
            self._bp_counts[pair_ind] = pair_bp_counts
            self._aa_counts[pair_ind] = pair_aa_counts

    def calculate_expected_arrays(self):
        """Return the one-hot base and amino acid arrays of the reference.

        Returns `(expected_bps, expected_aas)` with shapes `(6, ref_len)` and
        `(23, n_aas)`, the same row layout as the per-read count matrices.
        Codons missing from CODON_TABLE are recorded as "?".
        """
        # Create arrays for storing expected results. The row counts match
        # the count matrices built in analyze_alignments
        expected_bps = np.zeros([6, self.ref_len], dtype = int)
        expected_aas = np.zeros([23, self.n_aas], dtype = int)

        # Loop over the reference sequence and record expected basepairs
        for bp_ind, bp in enumerate(self.reference_sequence):
            expected_bps[BP_TO_IND[bp], bp_ind] += 1

        # Calculate last readable bp for translation
        last_readable_bp = self.in_frame_ind + self.n_aas * 3

        # Loop over the codons in the reference sequence and record
        codon_starts = range(self.in_frame_ind, last_readable_bp, 3)
        for aa_ind, codon_start in enumerate(codon_starts):

            # Identify the codon and translate
            codon = self.reference_sequence[codon_start: codon_start + 3]
            expected_aa = CODON_TABLE.get(codon, "?")
            expected_aas[AA_TO_IND[expected_aa], aa_ind] += 1

        # Make sure we are not double counting and that we are counting everything
        bp_test = np.sum(expected_bps, axis = 0)
        aa_test = np.sum(expected_aas, axis = 0)
        assert np.all(bp_test == 1), "Expected bp calculation is wrong"
        assert np.all(aa_test == 1), "Expected aa calculation is wrong"

        return expected_bps, expected_aas

    def find_variable_positions(self):
        """Placeholder: identifying variable positions is not implemented yet."""
        # TODO: compare the observed aa and bp frequencies against those
        # expected from the reference sequence
        pass

    # Define properties
    @property
    def all_seqpairs(self):
        return self._all_seqpairs

    @property
    def index_plate(self):
        return self._index_plate

    @property
    def plate_nickname(self):
        return self._plate_nickname

    @property
    def well(self):
        return self._well

    @property
    def reference_sequence(self):
        return self._reference_sequence

    @property
    def ref_len(self):
        return self._ref_len

    @property
    def n_aas(self):
        return self._n_aas

    @property
    def in_frame_ind(self):
        return self._in_frame_ind

    @property
    def fasta_loc(self):
        return self._fasta_loc

    @property
    def alignment_loc(self):
        return self._alignment_loc

    @property
    def non_dud_alignments(self):
        return self._non_dud_alignments

    @property
    def bp_counts(self):
        return self._bp_counts

    @property
    def aa_counts(self):
        return self._aa_counts

In [17]:
def assign_seqpairs_to_well(filtered_seqpairs, bc_to_ref_plate_well, savedir):
    """Group sequence pairs by barcode pair and wrap each group in a ``Well``.

    Parameters
    ----------
    filtered_seqpairs : iterable
        Objects exposing ``f_barcode`` and ``r_barcode`` attributes.
    bc_to_ref_plate_well : mapping
        Keyed by ``(f_barcode, r_barcode)``. The value stored under each key
        is handed to ``Well`` unchanged. Barcode pairs missing from this
        mapping are dropped.
    savedir
        Passed through to every ``Well`` as its third argument.

    Returns
    -------
    list
        One ``Well`` per recognized barcode pair, ordered by first appearance
        in ``filtered_seqpairs``.
    """
    print("Assigning sequences to wells...")

    grouped = {}
    for seqpair in filtered_seqpairs:
        barcodes = (seqpair.f_barcode, seqpair.r_barcode)

        # Barcode combinations that are not in the lookup are "fake" wells
        # produced by sequencing errors, so they are skipped
        if barcodes in bc_to_ref_plate_well:
            grouped.setdefault(barcodes, []).append(seqpair)

    # Build one well object per group of sequence pairs
    wells = []
    for barcodes, members in grouped.items():
        wells.append(Well(members, bc_to_ref_plate_well[barcodes], savedir))
    return wells

# Build the well objects from the filtered sequence pairs, passing "./" as
# the savedir argument
all_wells = assign_seqpairs_to_well(filtered_seqpairs, bc_to_ref_plate_well, "./")

Assigning sequences to wells...


In [18]:
# Choose a single well for testing
testwell = all_wells[0]

# Align
testwell.align()

# Analyze the alignments.
# NOTE(review): the meaning of the argument 30 is not visible here (possibly
# a quality-score cutoff) -- confirm against analyze_alignments
testwell.analyze_alignments(30)

In [20]:
# Build the expected base-pair and amino-acid arrays for the test well
expected_bp, expected_aa = testwell.calculate_expected_arrays()

In [21]:
# Inspect the dimensions of the base-pair count array (three axes; the
# analysis below sums over the first one)
testwell.bp_counts.shape

(508, 6, 144)

In [28]:
# Run the comparison on the base-pair arrays. The generic "unit" naming
# below suggests the same steps are meant to work for amino acids as well
# (presumably by swapping in expected_aa and testwell.aa_counts).
expected_array = expected_bp
test_array = testwell.bp_counts


# Get the counts for the array by collapsing the first axis, leaving one
# count per (unit, position)
by_unit_counts = test_array.sum(axis=0)

# Now get the total counts of real bases/amino acids. This means that
# gaps are ignored; unknown characters are allowed. We ignore gaps because
# the only gaps present are those at the beginning or end of the sequence;
# any gaps in the middle are not looked at for alignments.
# The [:-1] slice drops the last unit row, which the statement above treats
# as the gap row.
by_position_counts = by_unit_counts[:-1].sum(axis=0)

# Calculate frequency. Again, ignore gaps.
# NOTE(review): if any position has zero non-gap counts, this division
# yields NaN for that position -- confirm whether that can happen
by_unit_frequency = by_unit_counts[:-1] / by_position_counts

# Now compare to the expected array: take the absolute per-unit difference,
# then sum over units and halve. Each observed column sums to 1 by
# construction; assuming the expected column (gap row excluded) does too,
# the result lies between 0 (identical) and 1 (no overlap).
difference_from_expectation_absolute = np.abs(by_unit_frequency - expected_array[:-1])
average_difference_from_expectation = np.sum(difference_from_expectation_absolute, axis = 0)/2

In [31]:
# Flag positions whose observed composition differs from expectation by at
# least 0.9. In the output, the flagged positions line up with the runs of N
# in the reference sequence printed in a later cell.
average_difference_from_expectation >= 0.9

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True,  True,  True,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,  True, False, False, False, False, False, False,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [30]:
# Show the reference sequence for comparison with the flagged positions
testwell.reference_sequence

'GCACTGCAGAAACACTCAGTCGCTATTAGCGCCACGATGGGTCGGCTGNNNNNNGAACGGTATCCCGAAACGNNNAGCTTGNNNGAACTTCCTGAGAGACAGATACACAAGCTTGCGTCGGCCCTGTTGGCCTACGCCCGTAGT'

In [None]:
# Check the dimensions of the per-position totals
by_position_counts.shape

In [None]:
# Check the dimensions of the per-unit frequency array
by_unit_frequency.shape

In [None]:
# Display the per-unit frequencies
by_unit_frequency

In [None]:
# Amino-acid analogue of by_unit_counts: collapse the first axis of the
# amino-acid count array
by_aa_counts = testwell.aa_counts.sum(axis=0)

In [None]:
# Display the collapsed amino-acid counts
by_aa_counts

In [None]:
# NOTE(review): no assignment to `by_position_frequencies` is visible in
# this part of the notebook; `by_unit_frequency` may have been intended --
# confirm before running this cell
by_position_frequencies

In [None]:
# Count seqpairs in the test well where use_f_alignment is truthy but
# is_paired_post_alignment_qc() is falsy
sum(pair.use_f_alignment and not pair.is_paired_post_alignment_qc() for pair in testwell.all_seqpairs)

In [None]:
# Count seqpairs in the test well where use_r_alignment is truthy but
# is_paired_post_alignment_qc() is falsy
sum(pair.use_r_alignment and not pair.is_paired_post_alignment_qc() for pair in testwell.all_seqpairs)

In [None]:
# Run the per-seqpair alignment analysis directly on every entry in
# testwell.non_dud_alignments, with the same final argument (30) that was
# passed to testwell.analyze_alignments earlier.
# The commented-out try/except is a debugging aid: when enabled it prints the
# index `i` of any seqpair whose analysis raises, which is the only use of
# the enumerate index.
for i, seqpair in enumerate(testwell.non_dud_alignments):
#     try:
    seqpair.analyze_alignment(testwell.in_frame_ind, testwell.ref_len,
                          testwell.n_aas, 30)
#     except:
#         print(i)

In [None]:
def run_deseq(args):
    """Planned top-level driver (not implemented yet).

    Currently a no-op: ``args`` is ignored and ``None`` is returned. The
    intended sequence of stages is listed in the body.
    """
    # Planned stages, in order:
    #   1. Run all checks on args
    #   2. Build reference sequence stuff
    #   3. Parse files
    #   4. If just analyzing, stop here
    #   5. If just getting filtered fastq, stop here
    #   6. If just getting MSAs, stop here
    #   7. If getting decoupled alignment results, stop here
    #   8. If getting paired alignment results, stop here
    return None