In [1]:
# Import needed modules
import pandas as pd
import numpy as np
import csv

In [2]:
# Define a codon table
# Maps each DNA codon (3-letter string over T/C/A/G) to a one-letter amino-acid
# code; the three stop codons (TAG, TAA, TGA) map to '*'.
CodonTable = {'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 
               'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 
               'TGT': 'C', 'TGC': 'C', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 
               'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 
               'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 
               'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 
               'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 
               'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 
               'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 
               'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 
               'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 
               'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 
               'GGG': 'G', 'TAG': '*', 'TAA': '*', 'TGA': '*'}
# List of all codons in the table's insertion order. The order matters: a later
# cell samples from this list with a seeded NumPy RNG, so reordering the table
# would change which codons are drawn.
available_codons = list(CodonTable.keys())

# Complement lookup for each supported base; the ambiguous base "N" maps to itself
ReverseCompDict = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}


def ReverseComplement(seq):
    """Return the reverse complement of a DNA sequence.

    Parameters
    ----------
    seq : str
        Sequence made up of the characters "A", "T", "C", "G" and "N".

    Returns
    -------
    str
        The complement of `seq`, read from its last base to its first.

    Raises
    ------
    KeyError
        If `seq` contains a character that is not a key of `ReverseCompDict`
        (lowercase bases included).
    """
    # Walk the sequence back-to-front and complement one base at a time
    complemented = [ReverseCompDict[base] for base in seq[::-1]]
    return "".join(complemented)

The original purpose of this notebook was to create specific datasets for passing into ssSeq, with NNN indicating mutation positions, in order to validate ssSeq's effectiveness. The updates in this notebook generate a dataset with specified mutations spread throughout the reference sequence. Using this known dataset, we can then test the ssSeq_tiles_branch for its ability to run without specifying mutation positions. We will be creating data to answer the questions below:

1. Do we get the expected alignment frequencies and counts when passing in imperfect fastq files (i.e., some bases have Q-scores < 30)?
2. What happens if we add insertions or deletions to our reads?
3. What happens in all of the above cases in troubleshoot mode vs regular mode?
4. What happens in all of the above cases using a detailed refseq file?

First, generating the reference sequences. ~~Each reference sequence has 4 positions in it. The below types of reference sequence are prepared for 150 bp reads:~~

~~1. 3 in forward read, 1 in reverse.~~
~~2. 2 in forward, 2 in reverse.~~
~~3. 1 in forward, 3 in reverse.~~

For the tile based approach there is only one reference sequence, the parent sequence. I am using P411 C10 as the reference sequence here, looking at tile 1 only.

In [8]:
# Define the generic reference sequence
# The full P411 sequence is written as two adjacent string literals inside
# parentheses. Python concatenates adjacent literals, so the value is one
# continuous string with no embedded newline. A double-quoted literal cannot
# itself continue across a physical line break.
full_P411_seq = (
    "ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAAATTTACCGTTATTAAACACAGATAAACCGGTTCAAGCTTTGATGAAAATTGCGGATGAATTAGGAGAAATCTTTAAATTCGAGGCGCCTGGTCGTGTAACGCGCTACTTATCAAGTCAGCGTCTAATTAAAGAAGCATGCGATGAATCACGCTTTGATAAAGAGTTAAGTCAAGGTCTGAAATTTCTGCGTGATTTTCTTGGAGACGGGTTAGCCACAAGCTGGACGCATGAAAAAAATTGGAAAAAAGCGCATAATATCTTACTTCCAAGCTTTAGTCAGCAGGCAATGAAAGGCTATCATGCGAGTATGGTCGATATCGCCGTGCAGCTTGTTCAAAAGTGGGAGCGTCTAAATGCAGATGAGCATATTGAAGTATCGGAAGACATGACACGTTTAACGCTTGATACAATTGGTCTTTGCGGCTTTAACTATCGCCTTAACAGCTTTTACCGAGATCAGCCTCATCCATTTATTATAAGTCTGGTCCGTGCACTGGATGAAGTAATGAACAAGCTGCAGCGAGCAAATCCAGACGACCCAGCTTATGATGAAAACAAGCGCCAGTTTCAAGAAGATATCAAGGTGATGAACGACCTAGTAGATAAAATTATTGCAGATCGCAAAGCAAGGGGTGAACAAAGCGATGATTTATTAACGCAGATGCTAAACGGAAAAGATCCAGAAACGGGTGAGCCGCTTGATGACGGGAACATTCGCTATCAAATTATTACATTCTTATATGCGGGAGTTGAAGGTACAAGTGGTCTTTTATCATTTGCGCTGTATTTCTTAGTGAAAAATCCACATGTATTACAAAAAGTAGCAGAAGAAGCAGCACGAGTTCTAGTAGATCCTGTTCCAAGCTACAAACAAGTCAAACAGCTTAAATATGTCGGCATGGTCTTAAACGAAGCGCTGCGCTTATGGCCAACGGTTCCTTATTTTTCCCTATATGCAAAAGAAGATACGGTGCTTGGAGGAGAATATCCTTTAGAAAAAGGCGACGAAGTAATGGTTCTGATTCCTCAGCTTCACCGTGATAAAACAGTTTGGGGAGACGATGTGGAGGAGTTCCGTCCAGAGCGTTTTGAAAATCCAAGTGCGATTCCGCAGCATGCGTTTAAACCGTTTGGAAACGGTCAGCGTGCGTCTCTGGGTCAGCAGTTCGCTCTTCATGAAGCAACGCTGGTACTTGGTATGATGCTAAAACACTTTGACTTTGAAGATCATACAAACTACGAGCTCGATATTAAAGAACTGCAGACGTTAAAACCTAAAGGCTTTGTGGTAAAAGCAAAATCGAAAAAAATTCCGCTTGGCGGTATTCCTTCACCTAGCACTGAACAGTCTGCTAAAAAAGTACGCAAAAAGGCAGAAAACGCTCATAATACGCCGCTGCTTGTGCTATACGGTTCAAATATGGGTACCGCTGAAGGAACGGCGCGTGATTTAGCAGATATTGCAATGAGCAAAGGATTTGCACCGCAGGTCGCAACGCTTGATTCACACGCCGGAAATCTTCCGCGCGAAGGAGCTGTATTAATTGTAACGGCGTCTTATAACGGTCATCCGCCTGATAACGCAAAGCAATTTGTCGACTGGTTAGACCAAGCGTCTGCTGATGAAGTAAAAGGCGTTCGCTACTCCGTATTTGGATGCGGCGATAAAAACTGGGCTACTACGTATCAAAAAGTGCCTGCTTTTATCGATGAAACGCTTGCCGCTAAAGGGGCAGAAAACATCGCTGACCGCGGTGAAGCAGATGCAAGCGACGACTTTGAAGGCACATATGAAGAATGGCGTGAACATATGTGGAGTGACGTAGCAGCCTACTTTAACCTCGACATTGAAAACAGTGAAGATAATAAATCTACTCTTTCACTTCAATTTGTCGACAGCGCCGCGGATATGCCGCTTGCGAAAATGCACGGT"
    "GCGTTTTCAACGCTCGAGCACCACCACCACCACCACTGA"
)

# Reference sequence used for read simulation: only the first tile of the
# full gene (plus overlap regions)
ref_seq = "ATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAAATTTACCGTTATTAAACACAGATAAACCGGTTCAAGCTTTGATGAAAATTGCGGATGAATTAGGAGAAATCTTTAAATTCGAGGCGCCTGGTCGTGTAACGCGCTACTTATCAAGTCAGCGTCTAATTAAAGAAGCATGCGATGAATCACG"

# Baseline quality string: one "@" character per base of the reference sequence
baseline_quality = "@" * len(ref_seq)

# # Define the positions of "NNN" in each of the desired reference sequences
# pos1 = [9, 57, 111, 167]
# pos2 = [57, 111, 168, 201]
# pos3 = [111, 168, 201, 234]
# position_lists = [pos1, pos2, pos3]

# # Build the additional reference sequences
# other_refs = [None for _ in range(3)]
# for i, pos_list in enumerate(position_lists):
#     other_refs[i] = (ref_seq[:pos_list[0]] + "NNN" + ref_seq[pos_list[0] + 3: pos_list[1]] +
#                      "NNN" + ref_seq[pos_list[1] + 3 : pos_list[2]] +
#                      "NNN" + ref_seq[pos_list[2] + 3 : pos_list[3]] + 
#                      "NNN" + ref_seq[pos_list[3] + 3:])




With reference sequences built we can get on to making the fake fastq files. This is done in the below code cells.

array([108, 180, 177,  27, 123,  15,  90,  87,  33, 135])

In [47]:
# Seed the global NumPy RNG so the simulated plate is the same on every run
np.random.seed(2)

# Barcode table: one row per well, with the columns read below
# (IndexPlate, Well, F-BC, R-BC)
barcode_df = pd.read_csv("../ssSeqSupport/IndexMap.csv")

# Adapter sequences placed between the barcode and the gene sequence
f_adapter = "CACCCAAGACCACTCTCCGG"
r_adapter = "GGTAGACGGAGACAGGCGG"

# Quality characters that may be drawn for the mutated codon, and the Q-score
# each character stands for (">" is the only drawable option below Q30)
quality_opts = [">", "?", "A", "B", "C", "D", "E"]
qual_to_score = {"=": 28, ">": 29, "?": 30, "A": 31,
                 "B": 32, "C": 33, "D": 34, "E": 35}

# Number of whole codons in the reference; drawing a codon index and multiplying
# by three keeps every mutation in frame
n_codon_positions = len(ref_seq) // 3

# Build one record per barcode row. `k` is kept as the row counter because it
# stays defined after the loop and the file-writing cell further down reads it.
updated_bc_info = []
for k, (_, row) in enumerate(barcode_df.iterrows()):

    # Draw two distinct codons and keep the first one. The draw size is left
    # at 2 so the RNG stream (and therefore the dataset) is unchanged.
    sampled_codons = np.random.choice(available_codons, size=2, replace=False)
    codon_choice = sampled_codons[0]
    aa_choice = CodonTable[codon_choice]

    # Pick the codon (amino-acid) index to overwrite and its nucleotide offset
    aa_index = np.random.choice(n_codon_positions, 1)[0]
    nt_index = aa_index * 3

    # Swap the chosen codon into the reference sequence
    new_seq = ref_seq[:nt_index] + codon_choice + ref_seq[nt_index + 3:]

    # Forward read: barcode + adapter + first 123 bases of the gene.
    # Reverse read: barcode + adapter + reverse complement of the last 124 bases.
    forward_bc = row["F-BC"]
    reverse_bc = row["R-BC"]
    f_seq = forward_bc + f_adapter + new_seq[:123]
    r_seq = reverse_bc + r_adapter + ReverseComplement(new_seq[-124:])

    # Number of read pairs to emit for this well (0 to 100 inclusive)
    n_seqs = np.random.randint(0, 101)

    # Record everything the fastq-writing cell needs for this well
    updated_bc_info.append((row["IndexPlate"], row["Well"], forward_bc, reverse_bc,
                            n_seqs, f_seq, r_seq, aa_choice, nt_index, aa_index))

# Preview the first ten wells
updated_bc_info[:10]

[('DI01',
  'A01',
  'GATCATG',
  'GAACTGC',
  83,
  'GATCATGCACCCAAGACCACTCTCCGGATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAAATTTACCGTTATTAAACACAGATAAACCGGTTCAAGCTTTGATGAAAATTGCGGATGAATTAGGAGAAATCTTT',
  'GAACTGCGGTAGACGGAGACAGGCGGCGTGATTCATCCGTTGCTTCTTTAATTAGACGCTGACTTGATAAGTAGCGCGTTACACGACCAGGCGCCTCGAATTTAAAGATTTCTCCTAATTCATCCGCAATTTTCATCAAAGCTTGAACCG',
  'T',
  186,
  62),
 ('DI01',
  'A02',
  'TACATGG',
  'ACCAGGT',
  63,
  'TACATGGCACCCAAGACCACTCTCCGGATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAAATTTACCGTTATTAAACACAGATAAACCGGTTCAAGCTTTGATGAAAATTGCGGATGAAAAAGGAGAAATCTTT',
  'ACCAGGTGGTAGACGGAGACAGGCGGCGTGATTCATCGCATGCTTCTTTAATTAGACGCTGACTTGATAAGTAGCGCGTTACACGACCAGGCGCCTCGAATTTAAAGATTTCTCCTTTTTCATCCGCAATTTTCATCAAAGCTTGAACCG',
  'K',
  108,
  36),
 ('DI01',
  'A03',
  'AAGCACC',
  'TCTAGAG',
  40,
  'AAGCACCCACCCAAGACCACTCTCCGGATGACAATTAAAGAAATGCCTCAGCCAAAAACGTTTGGAGAGCTTAAAAATTTACCGTTATTAAACACAGATAAACCGGTTCAAGCTTTGATGAAAATTGCGGATGAATGGGGAGAAATCTTT',
  'TCTAGAGGGTAGACGGAGACAG

In [45]:
# Seed the global NumPy RNG so the per-read quality draws are reproducible
np.random.seed(2)

# FASTQ records (4 lines each) are collected in lists and joined once at the
# end. Joining guarantees the files never start with a blank line, even when
# the first well was assigned zero reads, and avoids repeated string "+=".
f_records = []
r_records = []

# Per-read bookkeeping table. The header has one name per value appended for
# each read below: the seven well-level fields plus one flag per base of the
# mutated codon (Q1-Q3).
# NOTE: the "AApos" column holds the amino-acid letter encoded by the
# introduced codon (`aachoice`); the positions are in "NTmutpos"/"AAmutpos".
read_list = [["IndexPlate", "Well", "F-BC", "R-BC", "AApos", "NTmutpos", "AAmutpos",
              "Q1", "Q2", "Q3"]]

# Build the fastq entries well by well
for i, (index_plate, well, fbc, rbc, nseqs, fseq, rseq, aachoice, ntpos, aapos) in enumerate(updated_bc_info):

    # One read pair per simulated sequence in this well
    for j in range(nseqs):

        # Draw 3 quality characters at random, one per base of the mutated codon
        qual_choice = np.random.choice(quality_opts, size = (1, 3), replace = True)[0]
        qual_string = "".join(qual_choice)

        # For each of the three bases, record whether its Q-score is >= 30
        greater30 = [qual_to_score[character] >= 30 for character in qual_string]

        # Baseline quality everywhere except the mutated codon
        quality_scores = (baseline_quality[:ntpos] + qual_string +
                          baseline_quality[ntpos + 3:])

        # Pad with "@" for the barcode + adapter (27 forward / 26 reverse
        # characters); the reverse qualities are flipped to follow the
        # reverse-complemented read
        f_qual = "@"*27 + quality_scores[:123]
        r_qual = "@"*26 + quality_scores[-124:][::-1]

        # Read identifier shared by the forward and reverse entries
        uid = "@M06418:33:000000000-CRL6Y:1:1101:{}:{} 1:N:0:162".format(i, j)

        # One 4-line FASTQ record per direction
        f_records.append("\n".join((uid, fseq, "+", f_qual)))
        r_records.append("\n".join((uid, rseq, "+", r_qual)))

        # Append to the read list
        read_list.append([index_plate, well, fbc, rbc, aachoice, ntpos, aapos, *greater30])

# Full file contents: records separated by newlines, no trailing newline
fstring = "\n".join(f_records)
rstring = "\n".join(r_records)

# Save everything. The file names are fixed strings, so nothing here depends
# on loop variables left over from other cells.
with open("P411_Tile1_R1_test.fastq", "w") as f:
    f.write(fstring)
with open("P411_Tile1_R2_test.fastq", "w") as f:
    f.write(rstring)
# newline="" lets the csv module control line endings itself (avoids blank
# rows on platforms that translate "\n")
with open("P411_Tile1_SequenceInfo.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(read_list)

Note that on running, as of 01/09/2020, the pos0 and pos4 cannot be run on ssSeq. This is because the software cannot currently handle having no variable regions in either the forward or reverse direction. Pos1 through pos3 were run on 01/09/2020, however, and are analyzed below. The appropriate files for these position sets can be found in "Set1" in the same folder of this Jupyter notebook.

In [6]:
# Load the results files: one summary CSV per test plate (plates 1-8), stacked
# into a single frame. The frames are gathered in a list and concatenated once;
# DataFrame.append was removed in pandas 2.0 and re-copies the data on every call.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# ssSeq output folder before running.
summary_path_template = "/home/brucejwittmann/GitRepos/ssSeq/ssSeq_Output/20200109-171117/Summaries/TestPlate0{}-0_SummaryInfo.csv"
summary_frames = []
for i in range(1, 9):
    summary_frames.append(pd.read_csv(summary_path_template.format(i)))
all_data = pd.concat(summary_frames)

# Load the sequence info
sequence_info = pd.read_csv("./Set1/Pos1/Pos1_SequenceInfo.csv")

FileNotFoundError: [Errno 2] File b'/home/brucejwittmann/GitRepos/ssSeq/ssSeq_Output/20200109-171117/Summaries/TestPlate01-0_SummaryInfo.csv' does not exist: b'/home/brucejwittmann/GitRepos/ssSeq/ssSeq_Output/20200109-171117/Summaries/TestPlate01-0_SummaryInfo.csv'

In [46]:
# Display the per-read sequence info; `sequence_info` is assigned by the
# loading cell above, which must have run without error first
sequence_info

NameError: name 'sequence_info' is not defined

In [None]:
# Unique (plate, well) pairs that appear in the sequence info
plate_well = tuple(set(tuple(zip(sequence_info.IndexPlate.values, sequence_info.Well.values))))

# Columns holding the per-read quality flags and the expected amino acids,
# plus the site number / read direction reported for each of the four columns
quality_columns = ["Q1", "Q2", "Q3", "Q4"]
aa_columns = ["AA1", "AA2", "AA3", "AA4"]
sites = [1, 2, 3, 1]
directions = ["Forward", "Forward", "Forward", "Reverse"]

# Work out the expected depth and amino acid for every site in every well
count_info = []
for plate, well in plate_well:

    # Rows of sequence_info that belong to this plate/well
    in_well = (sequence_info.IndexPlate==plate)&(sequence_info.Well==well)

    # Expected read depth per site: column-wise sum of the quality flags
    read_depths = sequence_info.loc[in_well, quality_columns].values.sum(axis=0)

    # Expected amino acids are taken from the first row of the well
    expected_aas = sequence_info.loc[in_well, aa_columns].values[0]

    # One output row per site
    count_info.extend(
        [plate, well, sites[idx], directions[idx], float(depth), expected_aas[idx]]
        for idx, depth in enumerate(read_depths)
    )

# Convert count_info into a dataframe
count_df = pd.DataFrame(
    count_info,
    columns = ["Plate", "Well", "Site", "ReadDirection", "ExpectedDepth", "ExpectedAA"],
)

In [None]:
# Now merge the count_df onto the all_data df
# Join the expected counts onto the ssSeq results. With no keys given, the
# merge uses the column names the two frames have in common (inner join).
complete_df = pd.merge(all_data, count_df)

If everything worked as expected, each well should call the appropriate amino acid along with the appropriate read depth. This is checked below:

In [None]:
# True only if every merged row reports the expected read depth
bool(complete_df["ExpectedDepth"].eq(complete_df["WellSeqDepth"]).all())

In [None]:
# True only if every merged row calls the expected amino acid
bool(complete_df["ExpectedAA"].eq(complete_df["AA"]).all())

For the case of Pos1, we are thus calling both the read depth and the amino acid correctly. Now, what if we have overlapping positions in the forward and reverse reads? Do we call these correctly?