In [1]:
import Bio # That's biopython 
from Bio import SeqIO # To read the sequences from the fasta files 
from Bio.Seq import Seq # To work with sequences as an object 
from Bio.SeqRecord import SeqRecord
from Bio import motifs # motif library
from Bio.Alphabet import IUPAC
# Pandas' data structure libraries 
import pandas as pd 
import numpy as np


In [2]:
# Test sequences used by the cells below.
# print() is written in call form everywhere so the cell runs on Python 2 and Python 3.
Seq1 = Seq("ATGCCTATG", IUPAC.IUPACUnambiguousDNA())
print(Seq1)
Seq2 = Seq("ATGCCCCCCC", IUPAC.IUPACUnambiguousDNA())
print(Seq2)
Seq3 = Seq("ATGCCCCCCGATG", IUPAC.IUPACUnambiguousDNA())
print(Seq3)
test_seq = Seq("TATGATGTAGTATAATATAATTATAA", IUPAC.IUPACUnambiguousDNA())
print(test_seq)
# 11-nt sequence made of the most frequent base of each column of the Kozak matrix
test_Kozak = Seq("CAAACAAAATG", IUPAC.IUPACUnambiguousDNA())
print(test_Kozak)
# 11-nt sequence made of a least frequent base of each column of the Kozak matrix
test_antiKozak = Seq("GGGGGTTTTCT", IUPAC.IUPACUnambiguousDNA())
print(test_antiKozak)

ATGCCTATG
ATGCCCCCCC
ATGCCCCCCGATG
TATGATGTAGTATAATATAATTATAA
CAAACAAAATG
GGGGGTTTTCT


In [3]:
# Create a motif object from a single instance, the start codon "ATG";
# it is used below to locate every ATG in a sequence.
motifATG = motifs.create( [ Seq("ATG") ] )
print(motifATG)

ATG



In [4]:
# Create the Kozak motif by reading the position frequency matrix (PFM)
# stored in PWMs/Kozak_PFM.txt (parsed with the 'pfm' format).
with open("PWMs/Kozak_PFM.txt") as handle: 
        KozakMotif = motifs.read(handle, 'pfm')

print (KozakMotif)

TF name	None
Matrix ID	None
Matrix:
        0      1      2      3      4      5      6      7      8      9     10
A:  60.00  90.00  70.00  80.00  80.00 160.00 110.00 110.00 200.00   0.00   0.00
C:  70.00  60.00  50.00  50.00 100.00  10.00  60.00  60.00   0.00   0.00   0.00
G:  10.00  10.00  30.00  20.00   5.00  25.00  15.00  15.00   0.00   0.00 200.00
T:  60.00  40.00  50.00  50.00  15.00   5.00  15.00  15.00   0.00 200.00   0.00





In [5]:
# From the Kozak count matrix to a position weight matrix (PWM).
# The printed frequencies are consistent with 0.5 being added to every cell
# before each column is normalised (e.g. 200 -> 0.99, 0 -> 0.00).
# TODO: the choice pseudocounts=0.5 is still an open question.
pwmKozak = KozakMotif.counts.normalize(pseudocounts=0.5)
print(pwmKozak)

        0      1      2      3      4      5      6      7      8      9     10
A:   0.30   0.45   0.35   0.40   0.40   0.79   0.55   0.55   0.99   0.00   0.00
C:   0.35   0.30   0.25   0.25   0.50   0.05   0.30   0.30   0.00   0.00   0.00
G:   0.05   0.05   0.15   0.10   0.03   0.13   0.08   0.08   0.00   0.00   0.99
T:   0.30   0.20   0.25   0.25   0.08   0.03   0.08   0.08   0.00   0.99   0.00



In [6]:
# From the PWM to a position-specific scoring matrix (PSSM) using log-odds.
# The printed values are consistent with log2(p / 0.25), e.g. 0.99 -> 1.99.

pssmKozak = pwmKozak.log_odds()
print (pssmKozak)

        0      1      2      3      4      5      6      7      8      9     10
A:   0.26   0.84   0.48   0.67   0.67   1.67   1.13   1.13   1.99  -6.66  -6.66
C:   0.48   0.26   0.00   0.00   0.99  -2.27   0.26   0.26  -6.66  -6.66  -6.66
G:  -2.27  -2.27  -0.73  -1.30  -3.20  -0.99  -1.70  -1.70  -6.66  -6.66   1.99
T:   0.26  -0.32   0.00   0.00  -1.70  -3.20  -1.70  -1.70  -6.66   1.99  -6.66



In [7]:
# Normalized score: 0 corresponds to pssm.min and 1 to pssm.max, like Bioconductor
def calc_norm_score (pssm, seq) :
    """Rescale the PSSM score(s) of ``seq`` by the score range of ``pssm``.

    Parameters
    ----------
    pssm : object exposing ``calculate(seq)``, ``min`` and ``max``.
    seq : sequence handed unchanged to ``pssm.calculate``.

    Returns
    -------
    ``(raw - pssm.min) / (pssm.max - pssm.min)`` where ``raw`` is whatever
    ``pssm.calculate(seq)`` returned (a single number or an array of numbers).
    """
    raw = pssm.calculate(seq)
    floor = pssm.min
    span = pssm.max - floor
    return (raw - floor) / span

# Sanity checks of calc_norm_score; print() in call form runs on Python 2 and Python 3
print(calc_norm_score( pssmKozak, test_seq )) # should be an array of numbers between 0 and 1
print(calc_norm_score( pssmKozak, test_Kozak )) # should be 1
print(calc_norm_score( pssmKozak, test_antiKozak )) # should be 0

[0.3798483  0.4352925  0.15832752 0.553093   0.29560044 0.33615193
 0.59644955 0.33142576 0.6381159  0.3224843  0.47655964 0.68698126
 0.52971977 0.35268602 0.66425353 0.30258524]
1.0000000052353402
2.3204725370384962e-08


In [None]:
# The function that calculates for a given sequence the ATGs, their position, their frames, and their score
def getpos (sequenceID, sequence, motifATG, pssmMotif):
    """Return one row per ATG found in ``sequence``.

    Parameters
    ----------
    sequenceID : label written to the 'Sequence' column of every row.
    sequence : sequence object to scan. Its ``alphabet`` attribute is overwritten
        with ``IUPAC.unambiguous_dna`` (a side effect on the caller's object).
    motifATG : motif whose ``instances.search(sequence)`` yields ``(pos, seq)`` hits.
    pssmMotif : scoring matrix exposing ``length`` and ``calculate(sequence)``.

    Returns
    -------
    pandas.DataFrame with the columns
    'Sequence', 'ATGNumber' (1-based rank of the hit), 'position' (start of the
    hit as reported by the search), 'frame' (``position % 3``) and 'score'.

    'score' is the PSSM score of the window that ends on the last base of the
    hit, i.e. the window starting at ``position - (pssmMotif.length - len(hit))``.
    It is NaN when the sequence does not contain that whole window (hit too
    close to the 5' end, or sequence shorter than the matrix).
    NOTE(review): this assumes the ATG occupies the final columns of the PSSM,
    as in the Kozak matrix loaded in this notebook - confirm for other matrices.
    """
    columns = ['Sequence','ATGNumber','position','frame','score']
    # NOTE(review): presumably needed so that pssmMotif.calculate accepts
    # sequences parsed from FASTA - TODO confirm.
    sequence.alphabet = IUPAC.unambiguous_dna

    hits = list(motifATG.instances.search(sequence))
    if not hits:
        return pd.DataFrame(columns = columns)

    # Score the sequence once: the result does not depend on the hit, and
    # calculate() yields one value per window (a bare number for a single window).
    width = pssmMotif.length
    if len(sequence) >= width:
        window_scores = np.atleast_1d(pssmMotif.calculate(sequence))
    else:
        window_scores = np.empty(0)

    # Collect plain records and build the frame once; DataFrame.append copied
    # the whole frame on every row and no longer exists in pandas >= 2.0.
    rows = []
    for ATGNumber, (pos, seq) in enumerate(hits, start=1):
        window_start = pos - (width - len(seq))
        if 0 <= window_start < len(window_scores):
            score = float(window_scores[window_start])
        else:
            score = np.nan
        rows.append({'Sequence': sequenceID,
                     'ATGNumber': ATGNumber,
                     'position': pos,
                     'frame': pos % 3,
                     'score': score})
    return pd.DataFrame(rows, columns = columns)

In [None]:
# Test the getpos function with many sequences: every CDS of the FASTA file is
# scanned and all ATG rows are saved to result.csv

result_columns = ['Sequence','ATGNumber','position','frame', 'score']
frames = []
for seq_record in SeqIO.parse("Crypto/H99_CDS.fa", "fasta"):
    # pssmKozak is the PSSM built earlier in this notebook; the name pssmMotif
    # used here before is never defined at notebook level.
    df = getpos(seq_record.id, seq_record.seq, motifATG, pssmKozak)
    if not df.empty:
        frames.append(df)

# One concatenation instead of DataFrame.append per record (removed in pandas 2.0)
if frames:
    dfresult = pd.concat(frames, ignore_index = True)
else:
    dfresult = pd.DataFrame(columns = result_columns)

# Written once after the loop rather than rewritten for every record
dfresult.to_csv ('result.csv', encoding ='utf-8', index = False)

In [None]:
# Report every ATG of Seq1 together with its rank, position and reading frame
index = position = frame = 0
for index, (pos, seq) in enumerate(motifATG.instances.search(Seq1), start=1):
    print("%i %s" % (pos, seq))
    ATGNumber, position, frame = index, pos, pos % 3
    print("ATGNumber is %i" % ATGNumber)
    print("position is %i" % position)
    print("frame is %i" % frame)

# Exactly one hit: there is no other ATG, which is flagged with -1
if index == 1:
    print("no other frame")
    ATGNumber, position, frame = index, -1, -1

In [None]:
# Find the second instance of the motif in Seq1 and report its frame and position
index = position = frame = 0
for index, (pos, seq) in enumerate(motifATG.instances.search(Seq1), start=1):
    print(index)
    print("%i %s" % (pos, seq))
    if index != 2:
        continue
    # second hit reached: remember it and stop searching
    position, frame = pos, pos % 3
    break

if index == 1:
    # only one hit, so there is no second instance
    print("no frame")
    position = frame = -1

print("frame is %i" % frame)
print("position is %i" % position)

In [None]:
# basic motif search: print the position and the matched instance of every ATG in Seq1
# (motifATG is the motif created earlier; the name motif1 used here before is never defined)
for pos, seq in motifATG.instances.search(Seq1):
    print("%i %s" % (pos, seq))

In [None]:
# Parse a fasta file and print, for each record, its id, the repr of its
# sequence and its length
for seq_record in SeqIO.parse("InputTestSeq.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
    # NOTE(review): getpos2 is not defined in the visible part of this notebook,
    # so this cell fails on a fresh kernel unless it comes from elsewhere - confirm.
    # Its return value is discarded here.
    getpos2(seq_record.seq,motifATG,verbose=False)

In [None]:
# Write ATGposTestSeq.txt: a header line, then one tab-separated line per
# sequence holding its id and the first two values returned by getpos2
# NOTE(review): getpos2 is not defined in the visible part of this notebook - confirm.
with open("ATGposTestSeq.txt","w") as output_file :
    output_file.write("Seq\tPos\tFrame\n")
    for seq_record in SeqIO.parse("InputTestSeq.fasta", "fasta"):
        posframe = getpos2(seq_record.seq,motifATG,verbose=False)
        output_file.write("\t".join([seq_record.id, str(posframe[0]), str(posframe[1])]) + "\n")

In [None]:
# The function OutputPosFrame takes a Fasta file containing sequences and writes a txt file containing the position of the
# alternative translation initiation and the frame
def OutputPosFrame (input_file, output_path="ATGposCandida_albicans_sc5314.Cand_albi_SC5314_V4.cds.all.txt"):
    """Write one tab-separated line (id, position, frame) per FASTA record.

    Parameters
    ----------
    input_file : path or handle accepted by ``SeqIO.parse(..., "fasta")``.
    output_path : file to (over)write; defaults to the file name this
        function has always written to.

    Returns
    -------
    The file object that was written. It is already closed when returned and
    is kept only so existing callers see the same return value.

    NOTE(review): getpos2 is not defined in the visible part of this notebook;
    its first two return values are written as the 'Pos' and 'Frame' columns.
    NOTE(review): another cell defines a function with this same name and a
    different default output file; whichever cell ran last wins.
    """
    # Single pass: each record is processed once and written immediately
    # (an earlier extra pass computed the same values and threw them away).
    with open(output_path,"w") as output_file :
        output_file.write("Seq\tPos\tFrame\n")
        for seq_record in SeqIO.parse(input_file, "fasta"):
            posframe = getpos2(seq_record.seq,motifATG,verbose=False)
            output_file.write(seq_record.id + "\t" + str(posframe[0]) + "\t" + str(posframe[1]) + "\n")
    return (output_file)

In [None]:
# Run OutputPosFrame on the small test FASTA file.
# NOTE: the name of the file it writes is set inside OutputPosFrame, not derived from this input path.
OutputPosFrame ("InputTestSeq.fasta")

In [None]:
# Run OutputPosFrame on the Candida albicans SC5314 CDS FASTA file.
# NOTE: the name of the file it writes is set inside OutputPosFrame, not derived from this input path.
OutputPosFrame ("Homologus fungal/Candida_albicans_sc5314.Cand_albi_SC5314_V4.cds.all.fa")

In [None]:
# The function OutputPosFrame takes a Fasta file containing sequences and writes a txt file containing the position of the
# alternative translation initiation and the frame
def OutputPosFrame (input_file, output_path="ATGposCryptoJEC21Seq.txt"):
    """Write one tab-separated line (id, position, frame) per FASTA record.

    Parameters
    ----------
    input_file : path or handle accepted by ``SeqIO.parse(..., "fasta")``.
    output_path : file to (over)write; defaults to the file name this
        function has always written to.

    Returns
    -------
    The file object that was written. It is already closed when returned and
    is kept only so existing callers see the same return value.

    NOTE(review): getpos2 is not defined in the visible part of this notebook;
    its first two return values are written as the 'Pos' and 'Frame' columns.
    NOTE(review): another cell defines a function with this same name and a
    different default output file; whichever cell ran last wins.
    """
    # Single pass: each record is processed once and written immediately
    # (an earlier extra pass computed the same values and threw them away).
    with open(output_path,"w") as output_file :
        # Three header fields for the three data fields (a doubled tab after
        # "Seq" used to shift the header by one column).
        output_file.write("Seq\tPos\tFrame\n")
        for seq_record in SeqIO.parse(input_file, "fasta"):
            posframe = getpos2(seq_record.seq,motifATG,verbose=False)
            output_file.write(seq_record.id + "\t" + str(posframe[0]) + "\t" + str(posframe[1]) + "\n")
    return (output_file)

In [None]:
# Tabulate the ATGs of Seq3; the first ATG found is recorded with the
# sentinel -1 for both position and frame, later ones with their real values.
atg_rows = []
index = 0
position = 0
frame = 0
for pos, seq in motifATG.instances.search(Seq3):
    index = index + 1
    ATGNumber = index
    if index == 1:
        frame = -1
        position = -1
    else:
        position = pos
        frame = pos % 3
    atg_rows.append({'Sequence': 'Seq',
                     'ATGNumber': ATGNumber,
                     'position': position,
                     'frame': frame})

# Build the frame once from the collected rows; DataFrame.append copied the
# whole frame per row and no longer exists in pandas >= 2.0.
df = pd.DataFrame(atg_rows, columns = ['Sequence','ATGNumber','position','frame'])

df

In [None]:
# Save the table currently held in df to result.csv (utf-8, index column omitted)
df.to_csv ('result.csv', encoding ='utf-8', index = False)

In [None]:
# Run the two-argument getpos on the three test sequences, print each table
# and stack them into dfresult.
# NOTE(review): the two-argument getpos is defined in a later cell of this
# notebook, so that cell has to be executed before this one.
df1 = getpos(Seq1,motifATG)
print (df1)

df2 = getpos(Seq2,motifATG)
print (df2)

df3 = getpos (Seq3, motifATG)
print (df3)

# One concatenation instead of repeated DataFrame.append (removed in pandas 2.0);
# empty tables are skipped so only real rows take part in the concatenation.
non_empty = [table for table in (df1, df2, df3) if not table.empty]
if non_empty:
    dfresult = pd.concat(non_empty, ignore_index = True)
else:
    dfresult = pd.DataFrame(columns = ['Sequence','ATGNumber','position','frame'])

dfresult

In [None]:
# getpos function from the fasta file: apply the two-argument getpos to every
# record of InputTestSeq.fasta and stack the resulting tables
result_columns = ['Sequence','ATGNumber','position','frame', 'score']
frames = []
for seq_record in SeqIO.parse("InputTestSeq.fasta", "fasta"):
    df = getpos(seq_record.seq, motifATG)
    if not df.empty:
        frames.append(df)

# One concatenation instead of DataFrame.append per record (removed in pandas 2.0)
if frames:
    dfresult = pd.concat(frames, ignore_index = True)
else:
    dfresult = pd.DataFrame(columns = result_columns)

dfresult

In [None]:
# The function that calculates for a given sequence the ATGs, their position and their frames, and their score
def getpos (sequence, motif, pssm=None):
    """Return one row per hit of ``motif`` in ``sequence``.

    Parameters
    ----------
    sequence : sequence to scan; it is also stored as-is in the 'Sequence' column.
    motif : motif whose ``instances.search(sequence)`` yields ``(pos, seq)`` hits.
    pssm : optional scoring matrix exposing ``length`` and ``calculate(sequence)``.
        When omitted, the notebook-level name ``pssmMotif`` is used, which is
        what this function always read; it is only looked up when there is at
        least one hit.

    Returns
    -------
    pandas.DataFrame with the columns
    'Sequence', 'ATGNumber' (1-based rank of the hit), 'position' (start of the
    hit as reported by the search), 'frame' (``position % 3``) and 'score'.

    'score' is the PSSM score of the window that ends on the last base of the
    hit, i.e. the window starting at ``position - (pssm.length - len(hit))``.
    It is NaN when the sequence does not contain that whole window (hit too
    close to the 5' end, or sequence shorter than the matrix).
    NOTE(review): this assumes the ATG occupies the final columns of the PSSM,
    as in the Kozak matrix loaded in this notebook - confirm for other matrices.
    """
    columns = ['Sequence','ATGNumber','position','frame','score']

    # Search with the motif that was passed in (the global motifATG used to be
    # searched regardless of this argument).
    hits = list(motif.instances.search(sequence))
    if not hits:
        return pd.DataFrame(columns = columns)

    if pssm is None:
        pssm = pssmMotif

    # Score the sequence once: the result does not depend on the hit, and
    # calculate() yields one value per window (a bare number for a single window).
    width = pssm.length
    if len(sequence) >= width:
        window_scores = np.atleast_1d(pssm.calculate(sequence))
    else:
        window_scores = np.empty(0)

    # Collect plain records and build the frame once; DataFrame.append copied
    # the whole frame on every row and no longer exists in pandas >= 2.0.
    rows = []
    for ATGNumber, (pos, seq) in enumerate(hits, start=1):
        window_start = pos - (width - len(seq))
        if 0 <= window_start < len(window_scores):
            score = float(window_scores[window_start])
        else:
            score = np.nan
        rows.append({'Sequence': sequence, # to check
                     'ATGNumber': ATGNumber,
                     'position': pos,
                     'frame': pos % 3,
                     'score': score})
    return pd.DataFrame(rows, columns = columns)