This notebook serves as a development ground for the new alignment protocol. Note that the Clustal Omega version used here was downloaded on 10/26/2020 from "http://www.clustal.org/omega/"; the binary file clustalo-1.2.4-Ubuntu-x86_64 was renamed to "clustalo".

In [1]:
# Load necessary modules
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align.Applications import ClustalOmegaCommandline
from Bio import pairwise2
from tqdm import tqdm
import numpy as np
import scipy.stats as ss
import pandas as pd
import warnings
import subprocess
import os

In [2]:
# Number of leading characters of each read that are taken as its barcode
BARCODE_LENGTH = 7

# Adapter lengths for forward and reverse reads. The barcode length plus the
# matching adapter length is sliced off the start of a read to give its
# "adapterless" sequence
ADAPTER_LENGTH_F = 20
ADAPTER_LENGTH_R = 19

# Wrap the BioPython alignment function so that we can quickly change
# parameters in one place later
def deseq_align(reference, query):
    """Return the first global pairwise alignment of `query` to `reference`."""

    # Gap open/extend scores are fixed here, end gaps are not penalized, and
    # only a single alignment is requested from BioPython
    alignments = pairwise2.align.globalxs(reference, query,
                                          open = -2, extend = -1,
                                          one_alignment_only = True,
                                          penalize_end_gaps = False)

    # The call returns a list; hand back its only requested entry
    best_alignment = alignments[0]
    return best_alignment
    

# Define an object that holds BioPython SeqRecords
class SeqPair():
    """Forward and reverse reads that belong together, plus their QC state.

    Reads are attached with `assign_f` / `assign_r`. Each read has a `use_*`
    flag that is switched on when the read is assigned and switched off again
    by `qc_reads` if the read fails QC. The per-read attributes (barcode,
    length, average quality, adapterless record) only exist once the matching
    `assign_*` method has been called.

    Relies on the module-level names `BARCODE_LENGTH`, `ADAPTER_LENGTH_F`,
    `ADAPTER_LENGTH_R`, `deseq_align` and `np`.
    """

    # Record that we don't have forward or reverse information yet. Record that
    # we don't have alignment information yet
    def __init__(self):

        # Did sequence pass QC?
        self._use_f = False
        self._use_r = False

        # Did alignments pass QC?
        self._use_f_alignment = False
        self._use_r_alignment = False

    # Assign forward reads
    def assign_f(self, f_record):
        """Attach a forward read and mark it as usable. Returns None."""

        # Build summary stats
        self._f_barcode, self._f_len, self._f_average_q = self.calculate_read_stats(f_record)

        # Slice the barcode and adapter off the front of the read
        self._f_adapterless = f_record[(BARCODE_LENGTH + ADAPTER_LENGTH_F):]

        # Note that we have forward information
        self._use_f = True

    # Assign a paired reverse read
    def assign_r(self, r_record):
        """Attach a reverse read and mark it as usable. Returns None."""

        # Build summary stats
        self._r_barcode, self._r_len, self._r_average_q = self.calculate_read_stats(r_record)

        # Slice the barcode and adapter off the front of the read, then
        # reverse complement it so that it matches the reference sequence
        sliced_r = r_record[(BARCODE_LENGTH + ADAPTER_LENGTH_R):]
        self._r_adapterless = sliced_r.reverse_complement(id = True, description = True)

        # Note that we have reverse information
        self._use_r = True

    # Calculate summary stats for a read
    def calculate_read_stats(self, record):
        """Return (barcode, length, mean phred quality) for `record`.

        The barcode is the first `BARCODE_LENGTH` characters of the read and
        the length is that of the full, untrimmed record.
        """
        barcode = str(record.seq)[:BARCODE_LENGTH]
        length = len(record)
        average_quality = np.mean(record.letter_annotations["phred_quality"])

        return barcode, length, average_quality

    # Write a function that performs QC on reads
    def qc_reads(self, length_filter, q_cutoff):
        """Stop using any assigned read that is too short or too low quality.

        A read fails if its length is below `length_filter` or its average
        quality is below `q_cutoff`. Reads that were never assigned are
        skipped.
        """
        # Only run QC if we actually recorded a forward read
        if self._use_f:
            if self.f_len < length_filter or self.f_average_q < q_cutoff:
                self._use_f = False

        # Only run QC if we actually recorded a reverse read
        if self._use_r:
            if self.r_len < length_filter or self.r_average_q < q_cutoff:
                self._use_r = False

    # Align forward and reverse reads to a reference sequence
    def align(self, reference):
        """Align every read still in use against `reference`.

        Raises
        ------
        AssertionError
            If neither read is in use.
        """
        if self.is_dud():
            raise AssertionError("No reads to align in reference.")

        # Only align the reads we are using
        if self.use_f:
            self._f_alignment = deseq_align(reference, self.f_adapterless.seq)
        if self.use_r:
            self._r_alignment = deseq_align(reference, self.r_adapterless.seq)

    # Write a function that runs QC on an alignment. We automatically discard an alignment
    # with an insertion or deletion.
    def qc_alignments(self):
        # Not implemented yet
        pass

    # Write a function for extracting information from the alignment
    def analyze_alignment(self):
        # Not implemented yet
        pass

    # Write a function that returns read lengths
    def read_lengths(self):
        """Return [forward length, reverse length], with NaN for unused reads.

        Raises
        ------
        AssertionError
            If neither read is in use.
        """
        if self.is_dud():
            raise AssertionError("No reads for which to return lengths.")

        return [self.f_len if self.use_f else np.nan,
                self.r_len if self.use_r else np.nan]

    # Check to see if we are using both sequences
    def is_paired(self):
        """Return True if both reads are in use, otherwise False."""
        return bool(self.use_r and self.use_f)

    # Check to see if we have no sequences in use
    def is_dud(self):
        """Return True if neither read is in use, otherwise False."""
        return not (self.use_r or self.use_f)

    # Make all the properties
    @property
    def use_f(self):
        return self._use_f

    @property
    def use_r(self):
        return self._use_r

    @property
    def use_f_alignment(self):
        return self._use_f_alignment

    @property
    def use_r_alignment(self):
        return self._use_r_alignment

    @property
    def f_barcode(self):
        return self._f_barcode

    @property
    def f_len(self):
        return self._f_len

    @property
    def f_average_q(self):
        return self._f_average_q

    @property
    def f_adapterless(self):
        return self._f_adapterless

    @property
    def r_barcode(self):
        return self._r_barcode

    @property
    def r_len(self):
        return self._r_len

    @property
    def r_average_q(self):
        return self._r_average_q

    @property
    def r_adapterless(self):
        return self._r_adapterless

    @property
    def orphan(self):
        # NOTE(review): nothing in this class ever assigned `_orphan`, so the
        # old accessor always raised AttributeError. It is derived here as
        # "exactly one of the two reads is in use" -- confirm that this is the
        # intended meaning of an orphan.
        return self.use_f != self.use_r
    

In [3]:
# Write a function for loading and pairing fastq files
def load_fastq(f_loc, r_loc):
    """Load forward and reverse fastq files and pair the reads by record id.

    Parameters
    ----------
    f_loc : path to the fastq file of forward reads.
    r_loc : path to the fastq file of reverse reads.

    Returns
    -------
    list of SeqPair
        One SeqPair per forward record id. A pair only carries a reverse read
        if the reverse file held a record with the same id; reverse reads
        whose id has no forward read are skipped.
    """
    # Create a dictionary that links id to sequence object. `assign_f`
    # returns None, so the SeqPair has to be built first and stored itself
    # rather than the result of the call.
    print("Loading forward reads...")
    id_to_reads = {}
    for f_record in SeqIO.parse(f_loc, "fastq"):
        seqpair = SeqPair()
        seqpair.assign_f(f_record)
        id_to_reads[f_record.id] = seqpair

    # Associate reverse reads with the forward
    print("Loading reverse reads...")
    for r_record in SeqIO.parse(r_loc, "fastq"):

        # If there is no partner in id_to_reads, skip this reverse read
        seqpair = id_to_reads.get(r_record.id)
        if seqpair is not None:
            seqpair.assign_r(r_record)

    # Return every pair that was seeded by a forward read
    return list(id_to_reads.values())

# Load fastq files. The forward-read file is passed first and the
# reverse-read file second, matching the (f_loc, r_loc) order of load_fastq.
all_seqpairs = load_fastq("./TestData/20200205_ssSeq/CHL2_S199_L001_R1_001.fastq",
                          "./TestData/20200205_ssSeq/CHL2_S199_L001_R2_001.fastq")

Loading forward reads...
Loading reverse reads...


In [49]:
# Scratch check of how the most common read length can be found: with
# axis = None the mode is taken over all entries of the nested list, and
# nan_policy = "omit" leaves the NaN entry out of the count
test_list = [[1, 3],
            [5, np.nan],
            [0, 3]]
ss.mode(test_list, axis = None, nan_policy = "omit")

ModeResult(mode=array([3.]), count=array([2.]))

In [4]:
# Write a function for filtering out bad seqpairs
def qc_seqpairs(all_seqpairs, read_length = None, length_cutoff = 0.9, q_cutoff = 30):
    """Return the seqpairs whose forward and reverse reads both pass QC.

    Parameters
    ----------
    all_seqpairs : iterable of SeqPair-like objects.
    read_length : expected read length. If None, the most common read length
        among the non-dud seqpairs is used.
    length_cutoff : fraction of `read_length` a read must reach to be kept.
    q_cutoff : minimum average quality score a read must reach to be kept.

    Returns
    -------
    list
        The seqpairs, in their original order, that are not duds, have both
        reads in use, and whose reads all meet the length and quality
        cutoffs. Empty if nothing survives.
    """
    # Eliminate any duds first, as everything below works on this list. It
    # used to be built at the very end of the function, after it had already
    # been read, so every call failed.
    no_orphans = [seqpair for seqpair in all_seqpairs if not seqpair.is_dud()]
    if not no_orphans:
        return []

    # If we don't have the read length determine it
    if read_length is None:

        # Get the most common read length, ignoring the NaN placeholders of
        # reads that are not in use. np.unique sorts its values and np.argmax
        # returns the first maximum, so ties go to the smallest length.
        all_readlengths = np.asarray([seqpair.read_lengths() for seqpair in no_orphans],
                                     dtype = float).ravel()
        observed = all_readlengths[~np.isnan(all_readlengths)]
        lengths, counts = np.unique(observed, return_counts = True)
        read_length = lengths[np.argmax(counts)]

    # Both reads must be at least this long
    length_filter = int(read_length * length_cutoff)

    # Both reads of a pair must pass the length and the quality cutoff.
    # Seqpairs with only one read in use are dropped up front, because the
    # attributes of the missing read cannot be checked.
    return [seqpair for seqpair in no_orphans
            if seqpair.is_paired()
            and seqpair.r_len >= length_filter
            and seqpair.f_len >= length_filter
            and seqpair.r_average_q >= q_cutoff
            and seqpair.f_average_q >= q_cutoff]

# Run QC with the default cutoffs. The filtering function defined in this
# cell is called qc_seqpairs.
filtered_seqpairs = qc_seqpairs(all_seqpairs)

In [5]:
# Every well ID of a 96-well plate: row letters "A" through "H" combined with
# the zero-padded column numbers "01" through "12"
ALLOWED_WELLS = {f"{plate_row}{plate_column:02d}"
                 for plate_row in "ABCDEFGH"
                 for plate_column in range(1, 13)}

# Load the index map and the reference sequence file
index_map = pd.read_csv("/home/brucejwittmann/GitRepos/ssSeq/ssSeqSupport/IndexMap.csv")
ref_seq_crude = pd.read_csv("/home/brucejwittmann/GitRepos/ssSeq/AlignmentDev/TestData/20200205_ssSeq/RefSeqs.csv")

# Repeat each row of the reference file once per allowed well, so that every
# well of a plate carries that plate's reference sequence
updated_ref_array = []
for row in ref_seq_crude.itertuples(index = False):
    updated_ref_array += [[row.PlateName, row.IndexPlate, well, row.ReferenceSequence]
                          for well in ALLOWED_WELLS]

# Define the complete reference sequence dataframe
ref_columns = ["PlateName", "IndexPlate", "Well", "ReferenceSequence"]
complete_ref_seq = pd.DataFrame(updated_ref_array, columns = ref_columns)

# Join on plate and well to bring in the barcode columns of the index map
merged_dfs = complete_ref_seq.merge(index_map, on = ["IndexPlate", "Well"])

# Map each (forward barcode, reverse barcode) combination to its reference
# sequence, plate, and well
bc_to_ref_plate_well = {
    (merged_row.FBC, merged_row.RBC): {"IndexPlate": merged_row.IndexPlate,
                                       "PlateNickname": merged_row.PlateName,
                                       "Well": merged_row.Well,
                                       "ReferenceSequence": merged_row.ReferenceSequence}
    for merged_row in merged_dfs.itertuples(index = False)
}

In [6]:
class Well():
    """All sequence pairs assigned to one well, plus that well's reference.

    Parameters
    ----------
    seqpairs : list of the sequence pairs that belong to this well.
    refseq_df_info : mapping with the keys "IndexPlate", "PlateNickname",
        "Well" and "ReferenceSequence".
    save_dir : directory under which the "ParsedFilteredFastas" and "Msas"
        output folders are placed.
    """

    # Initialization assigns attributes, reference sequences, and sequence pairs
    def __init__(self, seqpairs, refseq_df_info, save_dir):

        # Assign the sequence pairs as an attribute and unpack the refseq info
        self._seqpairs = seqpairs
        self._index_plate = refseq_df_info["IndexPlate"]
        self._plate_nickname = refseq_df_info["PlateNickname"]
        self._well = refseq_df_info["Well"]
        self._reference_sequence = refseq_df_info["ReferenceSequence"]

        # Generate save locations for alignment files. Both files share a
        # name and differ only in the folder they are written to.
        file_name = f"{self.index_plate}-{self.well}.fasta"
        self._fasta_loc = os.path.join(save_dir, "ParsedFilteredFastas", file_name)
        self._msa_loc = os.path.join(save_dir, "Msas", file_name)

    # Write a function that outputs fasta files for all of the seqpairs
    def write_fastas(self):
        """Write the reference and every read of this well to `fasta_loc`.

        The reference comes first, followed by the forward and then the
        reverse adapterless record of each seqpair. The output folder is
        created if it does not exist yet.
        """
        # Write a reference sequence
        reference = SeqRecord(Seq(self.reference_sequence),
                              id = "reference",
                              description = "reference")

        # Build the list of sequences to save
        records_to_save = [reference]
        for seqpair in self.seqpairs:
            records_to_save.append(seqpair.f_adapterless)
            records_to_save.append(seqpair.r_adapterless)

        # Save the records
        os.makedirs(os.path.dirname(self.fasta_loc), exist_ok = True)
        with open(self.fasta_loc, "w") as f:
            SeqIO.write(records_to_save, f, "fasta")

    # Write a function that uses Clustal Omega to make an MSA of the alignment
    def align_msa(self):
        """Run Clustal Omega on `fasta_loc` and write the MSA to `msa_loc`.

        Raises
        ------
        subprocess.CalledProcessError
            If the command exits with a non-zero status.
        """
        # Make sure the output folder exists before the command writes to it
        os.makedirs(os.path.dirname(self.msa_loc), exist_ok = True)

        # Build the command for this well and run it as a subprocess. The
        # command built here is the one that is run, not a global of the
        # same purpose.
        cl_call = ClustalOmegaCommandline(infile = self.fasta_loc, clustersize = len(self.seqpairs),
                                          outfile = self.msa_loc, auto = True, force = True, verbose = True)
        subprocess.run(str(cl_call), shell=True, check=True)

    # Write a function that makes pairwise alignments
    def align(self):
        # Not implemented yet
        pass

    # Write a function that analyzes the alignment output and identifies variable
    # positions
    def find_variable_positions(self, first_in_frame = None):
        """Placeholder: only warns when `first_in_frame` is not provided."""

        # Warn if first in frame is not provided
        if first_in_frame is None:
            warnings.warn("Not implemented")

    # Define properties
    @property
    def seqpairs(self):
        return self._seqpairs

    @property
    def index_plate(self):
        return self._index_plate

    @property
    def plate_nickname(self):
        return self._plate_nickname

    @property
    def well(self):
        return self._well

    @property
    def reference_sequence(self):
        return self._reference_sequence

    @property
    def fasta_loc(self):
        return self._fasta_loc

    @property
    def msa_loc(self):
        return self._msa_loc

In [7]:
def assign_seqpairs_to_well(filtered_seqpairs, bc_to_ref_plate_well, savedir):
    """Group seqpairs by barcode combination and build one Well per group.

    Seqpairs whose (forward barcode, reverse barcode) combination is not a
    key of `bc_to_ref_plate_well` are left out. Returns a list of Well
    objects, one per barcode combination that was seen.
    """
    print("Assigning sequences to wells...")

    # Collect the seqpairs of each known barcode combination
    grouped = {}
    for seqpair in filtered_seqpairs:

        # "Fake" wells are those that result from sequencing errors; they
        # have no entry in the barcode map and are dropped here
        barcodes = (seqpair.f_barcode, seqpair.r_barcode)
        if barcodes in bc_to_ref_plate_well:

            # Start a new list the first time a well is seen, then append
            grouped.setdefault(barcodes, []).append(seqpair)

    # Now build and return the well objects
    wells = []
    for barcodes, members in grouped.items():
        wells.append(Well(members, bc_to_ref_plate_well[barcodes], savedir))
    return wells

# Build the Well objects, using the current directory as the save directory
all_wells = assign_seqpairs_to_well(filtered_seqpairs, bc_to_ref_plate_well, "./")

Assigning sequences to wells...


In [8]:
# Write one fasta file per well
for well in all_wells:
    well.write_fastas()

In [12]:
# Align: build a Clustal Omega command for one hard-coded fasta file. Only
# the command object is created here; it is run in a later cell.
cl_call = ClustalOmegaCommandline(infile = "./ParsedFilteredFastas/DI01-C04.fasta", clustersize = len(all_wells[0].seqpairs),
                                  outfile = "./testAlign.fasta", auto = True, force = True)

In [13]:
%%timeit
# Time one run of the Clustal Omega command built above
subprocess.run(str(cl_call), shell=True, check=True)

3.07 s ± 41.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Index plate of the first well
all_wells[0].index_plate

'DI01'

In [15]:
# Well ID of the first well
all_wells[0].well

'C04'

In [16]:
# Number of seqpairs assigned to the first well
len(all_wells[0].seqpairs)

505

In [40]:
# Align the forward and the reverse read of the first seqpair in the first
# well against that well's reference sequence
test = deseq_align(all_wells[0].reference_sequence,
                   all_wells[0].seqpairs[0].f_adapterless.seq)
test2 = deseq_align(all_wells[0].reference_sequence,
                    all_wells[0].seqpairs[0].r_adapterless.seq)

In [37]:
# Field names of the alignment result
test._fields

('seqA', 'seqB', 'score', 'start', 'end')

In [38]:
# First aligned sequence of the forward-read alignment (the reference was
# passed as the first sequence)
test.seqA

'GCACTGCAGAAACACTCAGTCGCTATTAGCGCCACGATGGGTCGGCTGNNNNNNGAACGGTATCCCGAAACGNNNAGCTTGNNNGAACTTCCTGAGAGACAGATACACAAGCTTGCGTCGGCCCTGTTGGCCTACGCCCGTAGT'

In [39]:
# Second aligned sequence of the forward-read alignment (the forward read)
test.seqB

'GCACTGCAGAAACACTCAGTCGCTATTAGCGCCACGATGGGTCGGCTGCCGCCGGAACGGTATCCCGAAACGGTTAGCTTGGGTGAACTTCCTGAGAGACAGATACACAAGCTTGCGTCGGCC---------------------'

In [41]:
# First aligned sequence of the reverse-read alignment (the reference was
# passed as the first sequence)
test2.seqA

'GCACTGCAGAAACACTCAGTCGCTATTAGCGCCACGATGGGTCGGCTGNNNNNNGAACGGTATCCCGAAACGNNNAGCTTGNNNGAACTTCCTGAGAGACAGATACACAAGCTTGCGTCGGCCCTGTTGGCCTACGCCCGTAGT'

In [42]:
# Second aligned sequence of the reverse-read alignment (the reverse read)
test2.seqB

'-------------------TCGCTATTAGCGCCACGATGGGTCGGCTGCCGCCGGAACGGTATCCCGAAACGGTTAGCTTGGGTGAACTTCCTGAGAGACAGATACACAAGCTTGCGTCGGCCCTGTTGGCCTACGCCCGTAGT'

In [45]:
# Strip gap characters from the ends of the aligned forward read. Each call
# returns a new string and leaves test.seqB unchanged, so the first line
# strips only leading gaps and the second only trailing gaps; neither result
# has both ends stripped.
test.seqB.lstrip("-")
test.seqB.rstrip("-")

'GCACTGCAGAAACACTCAGTCGCTATTAGCGCCACGATGGGTCGGCTGCCGCCGGAACGGTATCCCGAAACGGTTAGCTTGGGTGAACTTCCTGAGAGACAGATACACAAGCTTGCGTCGGCC'

# Look into bowtie!

# Fastest is going to be pairwise align...just rework it

In [None]:
def run_deseq(args):
    """Placeholder for the top-level entry point; currently does nothing.

    `args` is accepted but not used yet. The comments below list the planned
    stages in order; none of them is implemented.
    """
    # Planned stages:
    #   1. Run all checks on args
    #   2. Build reference sequence stuff
    #   3. Parse files
    #
    # Planned early exits, in order:
    #   - If just analyzing, stop here
    #   - If just getting filtered fastq, stop here
    #   - If just getting MSAs, stop here
    #   - If getting decoupled alignment results, stop here
    #   - If getting paired alignment results, stop here
    return None