In [1]:
# Imports
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqUtils import MeltingTemp as mt
import pandas as pd

In [2]:
# The rap_probes function depends on the fasta_input function to 
# output an easily blat-able file in addition to the full probes list,
# so fasta_input comes first

def fasta_input(probe_complete, gene_name):
    '''Interleave probe names and sequences FASTA-style and save them as a csv.

    Column 0 of probe_complete is read as the probe name and column 1 as
    the probe sequence (both by position). The result is a one-column
    DataFrame ('Probes') whose rows alternate '>name', sequence, '>name',
    sequence, ... It is written to '<gene_name>_fasta-input.csv' in the
    current working directory and also returned.
    '''
    sequences = list(probe_complete.iloc[:, 1])

    # Build the alternating header/sequence rows in a single pass
    fasta_rows = []
    for row, sequence in enumerate(sequences):
        fasta_rows.append('>' + probe_complete.iloc[row, 0])
        fasta_rows.append(sequence)

    bip_pd = pd.DataFrame({'Probes': fasta_rows})

    # Write the table next to the notebook
    bip_pd.to_csv(gene_name + '_fasta-input.csv')

    return bip_pd

In [5]:
def _probe_start_indices(seq_len, probe_length, coverage):
    '''Return the start index of every full-length probe tiling seq_len bases.'''
    if probe_length <= 0 or coverage <= 0:
        raise ValueError('probe_length and coverage must both be positive')
    if seq_len < probe_length:
        raise ValueError(
            'Sequence length (' + str(seq_len) + ') is shorter than probe_length ('
            + str(probe_length) + '); no full-length probe can be made'
        )

    # Consecutive probes start probe_length/coverage bases apart
    step = probe_length / coverage
    n_candidates = max(1, int(seq_len / probe_length * coverage))
    starts = [int(i * step) for i in range(n_candidates)]

    # With coverage > 1 the last candidates would run off the end of the
    # sequence and give truncated probes; keep only full-length ones
    starts = [s for s in starts if s + probe_length <= seq_len]

    # Add one end-anchored probe only if more than a quarter of a probe is
    # still uncovered after the last tiled probe. Measuring from the END of
    # the last probe (not its start) avoids appending a duplicate probe when
    # the tiling already reaches the end of the sequence.
    uncovered = seq_len - (starts[-1] + probe_length)
    if uncovered > probe_length / 4:
        starts.append(seq_len - probe_length)

    return starts


def rap_probes(file, gene_name, probe_length = 50, coverage  = 1, adaptor = 'CAAGTCA', nmol = '25nm'):
    '''Generate RNA affinity purification probes from a FASTA file.

    The record in the FASTA file is reverse-complemented and tiled with
    probes of probe_length bases, and the adaptor is prepended to each one.

    Parameters
    ----------
    file : path or handle passed to Bio.SeqIO.read (single-record FASTA)
    gene_name : str, prefix for probe labels and output file names
    probe_length : int, bases of target-complementary sequence per probe
    coverage : number, tiling density (probes start probe_length/coverage
        bases apart)
    adaptor : str, sequence prepended to the 5' end of every probe
    nmol : value repeated on every row of the 'Nanomoles' column

    Returns
    -------
    (probe_complete, name) : the probe DataFrame and the csv file name it
        was written to.

    Side effects
    ------------
    Writes '<gene_name>_<probe_length>mer_Probes.csv' and, through
    fasta_input, '<gene_name>_<probe_length>mer_Probes_fasta-input.csv'
    to the current working directory.

    Raises
    ------
    ValueError if probe_length or coverage is not positive, or if the
    sequence is shorter than probe_length.
    '''
    # Necessary imports:
    # from Bio.Seq import Seq
    # import pandas as pd
    # from Bio import SeqIO
    # from Bio.SeqUtils import MeltingTemp as mt
    # fasta_input function defined above

    # Read the FASTA record and take its reverse complement, so the probes
    # are complementary to the sequence given in the file
    target = SeqIO.read(file, 'fasta').seq
    antisense = target.reverse_complement()

    # Start index of each probe along the reverse-complemented sequence
    starts = _probe_start_indices(len(antisense), probe_length, coverage)

    # Cut out the probes and add the adaptor to their 5' end
    probes = [str(adaptor + antisense[i:i+probe_length]) for i in starts]

    # Lengths are reported for the full oligo (adaptor included)
    lengths = [len(i) for i in probes]

    # Tms based on hybridization buffer; note these are computed on the full
    # oligo, adaptor included, because the adaptor is already attached here
    melts = [round(mt.Tm_NN(i, Na = 500, Tris = 10), 1) for i in probes]

    # Labels are numbered from 1 in tiling order
    labels = [gene_name + '_' + str(probe_length) + 'mer_' + str(i) for i in range(1, len(probes)+ 1)]

    # Same order amount on every row
    idt_bulk = [nmol] * len(probes)

    # Put everything in a DataFrame
    probe_complete = pd.DataFrame({
        'Name':labels,
        'Sequence':probes,
        'Length (bp)': lengths,
        'Tm (°C)':melts,
        'Nanomoles':idt_bulk
    })

    # Name and write the full probe table
    name = gene_name + '_' + str(probe_length) + 'mer' + '_Probes.csv'
    probe_complete.to_csv(name)

    # Create a csv that is compatible with programs that require fasta inputs
    fasta_name = gene_name + '_' + str(probe_length) + 'mer' + '_Probes'
    fasta_input(probe_complete, fasta_name)

    return probe_complete, name

In [7]:
# Design 83-nt probes against MuTsix; with the default 7-nt adaptor each
# ordered oligo is 90 nt long, hence the "90mer" variable name
tsix90mer, filename = rap_probes(
    file='MuTsix.fasta.txt',
    gene_name='MuTsix',
    probe_length=83,
)
print(filename)
print(tsix90mer.head(5))

MuTsix_83mer_Probes.csv
             Name                                           Sequence  \
0  MuTsix_83mer_1  CAAGTCATGGAGGTGTTTTTTAGAAAAATATTTATTTAATGTGAGT...   
1  MuTsix_83mer_2  CAAGTCACGGATCCCACTACAGATGACTGTGAGCCACTACGAAGTT...   
2  MuTsix_83mer_3  CAAGTCACTAACCACTGAGCTATCTCTCCAGCCCAGGAACTGAGTT...   
3  MuTsix_83mer_4  CAAGTCATGTCCCCGAAACCTGACATACCTTGATGTACACGGTGTG...   
4  MuTsix_83mer_5  CAAGTCACAGAGACTAAATTTAACTCTTTAATTTGATGATTCTCTC...   

   Length (bp)  Tm (°C) Nanomoles  
0           90     81.0      25nm  
1           90     88.8      25nm  
2           90     84.8      25nm  
3           90     85.6      25nm  
4           90     82.4      25nm  


In [8]:
# Probes were pasted into blat by hand, 25 at a time; any probe with >25
# matches in the genome is excluded. The numbers below are the 1-based probe
# numbers that failed, turned into full probe names so they can be matched
# against the 'Name' column.
bad_blat_numbers = [1, 2, 3, 7, 9, 10, 18, 20, 24, 28, 51]
bad_blats = ['MuTsix_83mer_{}'.format(number) for number in bad_blat_numbers]

print(bad_blats)

['MuTsix_83mer_1', 'MuTsix_83mer_2', 'MuTsix_83mer_3', 'MuTsix_83mer_7', 'MuTsix_83mer_9', 'MuTsix_83mer_10', 'MuTsix_83mer_18', 'MuTsix_83mer_20', 'MuTsix_83mer_24', 'MuTsix_83mer_28', 'MuTsix_83mer_51']


In [9]:
# Drop every probe whose name is on the bad-blat list (one mask, one pass).
# The original row index is kept, so gaps in the index mark removed probes.
is_bad_blat = tsix90mer.Name.isin(bad_blats)
tsix90mer = tsix90mer[~is_bad_blat]

print(tsix90mer.head(10))

               Name                                           Sequence  \
3    MuTsix_83mer_4  CAAGTCATGTCCCCGAAACCTGACATACCTTGATGTACACGGTGTG...   
4    MuTsix_83mer_5  CAAGTCACAGAGACTAAATTTAACTCTTTAATTTGATGATTCTCTC...   
5    MuTsix_83mer_6  CAAGTCACCAGTACCTCGCAAGTTCTAAATTCTCTTTGTACAGCTC...   
7    MuTsix_83mer_8  CAAGTCATGGTGTGCTTGTTACTCTGATCCTGATCGCTCTGTCAAC...   
10  MuTsix_83mer_11  CAAGTCACTCAATAGTAAGATTAGTGAATTGCTGGCACTTTGATCG...   
11  MuTsix_83mer_12  CAAGTCACCTAAAGGGAACTTAGAACAGACTGTGAATTATTTGTCA...   
12  MuTsix_83mer_13  CAAGTCACAGGGTGTCTGATCTCTTTCATGTGGATATTCATAGTTT...   
13  MuTsix_83mer_14  CAAGTCACTGGGGCGAGTAAGATACCAATGAGCTATTATTCCCTCA...   
14  MuTsix_83mer_15  CAAGTCAGAGTAACGTACTTCAGTGCGTTTTTGTCCCAAGGTATGG...   
15  MuTsix_83mer_16  CAAGTCAAACTATGAGCGTAAGCCCACCAAATCGGTCACAACTAAT...   

    Length (bp)  Tm (°C) Nanomoles  
3            90     85.6      25nm  
4            90     82.4      25nm  
5            90     84.3      25nm  
7            90     83.5      25nm  


In [10]:
# Write the surviving probes in FASTA-style layout so they can be checked for
# repetitive elements in Repeat Masker and Tandem Repeats Finder
tsix90mer_repeatmask = fasta_input(
    probe_complete=tsix90mer,
    gene_name='MuTsix_83mer_Probes_Repeat',
)

# No repeats found. Proceed to renaming

In [13]:
# Renumber the remaining probes 1..N so the final names have no gaps.
# Column 0 is the probe name; the row index is left as it was.
final_names = ['MuTsix_83mer_' + str(number) for number in range(1, len(tsix90mer) + 1)]
tsix90mer.iloc[:, 0] = final_names
print(tsix90mer.head(5))

              Name                                           Sequence  \
3   MuTsix_83mer_1  CAAGTCATGTCCCCGAAACCTGACATACCTTGATGTACACGGTGTG...   
4   MuTsix_83mer_2  CAAGTCACAGAGACTAAATTTAACTCTTTAATTTGATGATTCTCTC...   
5   MuTsix_83mer_3  CAAGTCACCAGTACCTCGCAAGTTCTAAATTCTCTTTGTACAGCTC...   
7   MuTsix_83mer_4  CAAGTCATGGTGTGCTTGTTACTCTGATCCTGATCGCTCTGTCAAC...   
10  MuTsix_83mer_5  CAAGTCACTCAATAGTAAGATTAGTGAATTGCTGGCACTTTGATCG...   

    Length (bp)  Tm (°C) Nanomoles  
3            90     85.6      25nm  
4            90     82.4      25nm  
5            90     84.3      25nm  
7            90     83.5      25nm  
10           90     83.4      25nm  


In [14]:
# Export final probes. `filename` is the name returned by rap_probes, so this
# overwrites the unfiltered probe table that rap_probes wrote earlier.
tsix90mer.to_csv(path_or_buf=filename)