This notebook serves as a development ground for the new alignment protocol. Note that the Clustal Omega binary used here was downloaded on 10/26/2020 from "http://www.clustal.org/omega/"; the downloaded file, clustalo-1.2.4-Ubuntu-x86_64, was renamed to "clustalo".

In [26]:
# Load necessary modules
from Bio import SeqIO
from tqdm import tqdm
import pandas as pd
import warnings

In [2]:
# Read-layout constants. The slicing in SeqPair treats the first
# `BarcodeLength` characters of a read as its barcode and skips a further
# `AdapterLengthF` (forward) or `AdapterLengthR` (reverse) characters before
# the part of the read that is kept.
BarcodeLength = 7
AdapterLengthF = 27
AdapterLengthR = 26

# Define an object that holds BioPython SeqRecords
class SeqPair():
    """Hold a forward read and, once assigned, its paired reverse read.

    A SeqPair starts life as an "orphan" built from a forward record only.
    Calling `assign_partner` attaches the reverse record and clears the
    orphan flag. Until then the reverse-read properties (`r_barcode`,
    `r_adapterless`, `r_len`) are `None`.

    Records are expected to behave like BioPython SeqRecords: they expose
    `.seq`, support `len()` and slicing, and a slice of the reverse record
    supports `.reverse_complement()`.
    """

    # Parse forward reads during initialization
    def __init__(self, f_record):
        """Store the barcode, adapterless slice and length of `f_record`."""

        # Assign the forward barcode and the adapterless sequence
        self._f_barcode = str(f_record.seq)[:BarcodeLength]
        self._f_adapterless = f_record[(BarcodeLength + AdapterLengthF):]
        self._f_len = len(f_record)

        # Reverse-read attributes are defined up front so that reading them
        # on an orphan gives None instead of raising AttributeError
        self._r_barcode = None
        self._r_adapterless = None
        self._r_len = None

        # Until we have a reverse record paired, this is an orphan
        self._orphan = True

    # Assign a paired reverse read
    def assign_partner(self, r_record):
        """Attach the reverse read `r_record` and mark the pair as complete."""

        # Assign the reverse barcode and adapterless sequence.
        # We want to have the reverse complement of this sequence to match
        # the reference sequence
        self._r_barcode = str(r_record.seq)[:BarcodeLength]
        sliced_r = r_record[(BarcodeLength + AdapterLengthR):]
        self._r_adapterless = sliced_r.reverse_complement()
        self._r_len = len(r_record)

        # This is no longer an orphan
        self._orphan = False

    # Read-only views of the stored values
    @property
    def f_barcode(self):
        """Leading `BarcodeLength` characters of the forward read."""
        return self._f_barcode

    @property
    def f_adapterless(self):
        """Forward record with the barcode and adapter prefix sliced off."""
        return self._f_adapterless

    @property
    def f_len(self):
        """Length of the full (unsliced) forward record."""
        return self._f_len

    @property
    def r_barcode(self):
        """Leading `BarcodeLength` characters of the reverse read, or None."""
        return self._r_barcode

    @property
    def r_adapterless(self):
        """Reverse-complemented, adapterless reverse record, or None."""
        return self._r_adapterless

    @property
    def r_len(self):
        """Length of the full (unsliced) reverse record, or None."""
        return self._r_len

    @property
    def orphan(self):
        """True until a reverse read has been assigned."""
        return self._orphan

In [3]:
# Write a function for loading and pairing fastq files
def load_fastq(f_loc, r_loc):
    """Load a forward and a reverse fastq file and pair reads by record id.

    Parameters
    ----------
    f_loc
        Location of the forward-read fastq file, passed to `SeqIO.parse`.
    r_loc
        Location of the reverse-read fastq file, passed to `SeqIO.parse`.

    Returns
    -------
    tuple
        The `SeqPair` objects that received a reverse partner. Forward reads
        with no matching reverse id, and reverse reads with no matching
        forward id, are dropped.
    """

    # Create a dictionary that links id to sequence object. Records are
    # consumed straight from the parser, so the raw records are not all held
    # in memory alongside the SeqPair objects built from them.
    print("Loading forward reads...")
    id_to_reads = {f_record.id: SeqPair(f_record)
                   for f_record in SeqIO.parse(f_loc, "fastq")}

    # Associate reverse reads with the forward
    print("Loading reverse reads...")
    for r_record in SeqIO.parse(r_loc, "fastq"):

        # If there is no partner in id_to_reads, skip this reverse read;
        # otherwise attach it to its forward read
        seqpair = id_to_reads.get(r_record.id)
        if seqpair is not None:
            seqpair.assign_partner(r_record)

    # Only keep records that have a partner
    return tuple(seqpair for seqpair in id_to_reads.values()
                 if not seqpair.orphan)

In [4]:
# Load and pair the forward (R1) and reverse (R2) fastq test files
all_seqpairs = load_fastq(
    f_loc="./TestData/20200205_ssSeq/CHL2_S199_L001_R1_001.fastq",
    r_loc="./TestData/20200205_ssSeq/CHL2_S199_L001_R2_001.fastq",
)

Loading forward reads...
Loading reverse reads...


In [25]:
# Every well label of a 96-well plate: rows A-H crossed with zero-padded
# columns 01-12 ('A01' ... 'H12')
ALLOWED_WELLS = {f"{plate_row}{plate_col:02d}"
                 for plate_row in "ABCDEFGH"
                 for plate_col in range(1, 13)}

# Load the index map and the reference sequence file
# NOTE(review): both paths are absolute and machine-specific
index_map = pd.read_csv("/home/brucejwittmann/GitRepos/ssSeq/ssSeqSupport/IndexMap.csv")
ref_seq_crude = pd.read_csv("/home/brucejwittmann/GitRepos/ssSeq/AlignmentDev/TestData/20200205_ssSeq/RefSeqs.csv")

# Expand each reference sequence so that every allowed well of a plate gets
# its own row. Wells are visited in sorted order because the iteration order
# of a set of strings is not stable between interpreter runs; sorting keeps
# the row order (and the insertion order of the final lookup) reproducible.
updated_ref_array = []
for row in ref_seq_crude.itertuples(index=False):
    updated_ref_array.extend(
        [row.PlateName, row.IndexPlate, well, row.ReferenceSequence]
        for well in sorted(ALLOWED_WELLS)
    )

# Define the complete reference sequence dataframe
complete_ref_seq = pd.DataFrame(
    updated_ref_array,
    columns=["PlateName", "IndexPlate", "Well", "ReferenceSequence"],
)

# Join on plate and well
merged_dfs = complete_ref_seq.merge(index_map, on=["IndexPlate", "Well"])

# Map barcode to reference sequence, plate, and well. Keys are
# (forward barcode, reverse barcode) tuples taken from the FBC and RBC
# columns of the merged table.
# NOTE(review): if two merged rows share the same (FBC, RBC) pair, the later
# row silently replaces the earlier one - confirm this cannot happen.
bc_to_ref_plate_well = {
    (row.FBC, row.RBC): {
        "IndexPlate": row.IndexPlate,
        "PlateNickname": row.PlateName,
        "Well": row.Well,
        "ReferenceSequence": row.ReferenceSequence,
    }
    for row in merged_dfs.itertuples(index=False)
}

In [None]:
class Well():
    """Bundle the sequence pairs assigned to one well with its reference info.

    Parameters
    ----------
    seqpairs
        The sequence pairs belonging to this well; stored as given.
    refseq_df_info
        Mapping with the keys "IndexPlate", "PlateNickname", "Well" and
        "ReferenceSequence".
    """

    # Initialization assigns attributes, reference sequences, and sequence pairs
    def __init__(self, seqpairs, refseq_df_info):

        # Assign the sequence pairs as an attribute and unpack the refseq info
        self._seqpairs = seqpairs
        self._index_plate = refseq_df_info["IndexPlate"]
        self._plate_nickname = refseq_df_info["PlateNickname"]
        self._well = refseq_df_info["Well"]
        self._reference_sequence = refseq_df_info["ReferenceSequence"]

    # Write a function that outputs fasta files for all of the seqpairs
    def write_fastas(self):
        """Placeholder: not implemented yet, currently does nothing."""
        pass

    # Write a function that uses Clustal Omega to make an MSA of the alignment
    def align(self):
        """Placeholder: not implemented yet, currently does nothing."""
        pass

    # Write a function that analyzes the alignment output and identifies variable
    # positions
    def find_variable_positions(self, first_in_frame = None):
        """Validate `first_in_frame`; the analysis itself is not written yet.

        Parameters
        ----------
        first_in_frame
            Required despite the `None` default.
            NOTE(review): presumably the position of the first in-frame
            base of the reference - confirm once the analysis is written.

        Raises
        ------
        ValueError
            If `first_in_frame` is not provided.
        """

        # Error if first in frame is not provided. A bare `raise` here would
        # have no active exception to re-raise and would surface as an
        # unrelated RuntimeError, so raise an explicit, descriptive error.
        if first_in_frame is None:
            raise ValueError("first_in_frame must be provided")

    # Define properties
    @property
    def seqpairs(self):
        """Sequence pairs assigned to this well."""
        return self._seqpairs

    @property
    def index_plate(self):
        """Value stored under "IndexPlate" in the reference info."""
        return self._index_plate

    @property
    def plate_nickname(self):
        """Value stored under "PlateNickname" in the reference info."""
        return self._plate_nickname

    @property
    def well(self):
        """Value stored under "Well" in the reference info."""
        return self._well

    @property
    def reference_sequence(self):
        """Value stored under "ReferenceSequence" in the reference info."""
        return self._reference_sequence

In [24]:
def assign_seqpairs_to_well(all_seqpairs, bc_to_ref_plate_well):
    """Group sequence pairs by their (forward, reverse) barcode combination.

    Parameters
    ----------
    all_seqpairs
        Iterable of objects exposing `f_barcode` and `r_barcode`.
    bc_to_ref_plate_well
        Container of the known (forward barcode, reverse barcode) tuples;
        only membership is checked here.

    Note: the step that turns these groups into well objects is not part of
    this section of the function.
    """
    print("Assigning sequences to wells...")

    # Bucket every pair under its barcode combination
    well_pairs = {}
    for pair in all_seqpairs:

        # Barcode combinations that are not real wells ("fake" wells that
        # result from sequencing errors) are left out
        well_id = (pair.f_barcode, pair.r_barcode)
        if well_id in bc_to_ref_plate_well:

            # setdefault starts a fresh list the first time a well is seen
            # and hands back the growing list on every later visit
            well_pairs.setdefault(well_id, []).append(pair)
            
    # Now build the well object
            
 