In [None]:
import Bio # That's biopython 
from Bio import SeqIO # To read the sequences from the fasta files 
from Bio.Seq import Seq # To work with sequences as an object 
from Bio.SeqRecord import SeqRecord
from Bio import motifs # motif library
from Bio.Alphabet import IUPAC
# Pandas' data structure libraries 
import pandas as pd 
import numpy as np


In [None]:
# Small in-memory DNA test sequences used by the exploratory cells below.
# Every print uses the call form so the cell parses on Python 2 and Python 3
# alike (the original last line used the Python 2-only `print x` statement).
Seq1 = Seq("ATGCCTATG", IUPAC.IUPACUnambiguousDNA())
print(Seq1)
Seq2 = Seq("ATGCCCCCCC", IUPAC.IUPACUnambiguousDNA())
print(Seq2)
Seq3 = Seq("ATGCCCCCCGATG", IUPAC.IUPACUnambiguousDNA())
print(Seq3)
test_seq = Seq("TATGATGTAGTATAATATAATTATAA", IUPAC.IUPACUnambiguousDNA())
print(test_seq)

In [None]:
# Alternative construction: build Seq1 without an alphabet argument, then
# attach IUPAC.unambiguous_dna afterwards by assigning the `alphabet` attribute.
# This rebinds the Seq1 created in the test-sequence cell (same letters).
Seq1 = Seq("ATGCCTATG")  
Seq1.alphabet = IUPAC.unambiguous_dna
print (Seq1)

In [None]:
 #Create a new object motif "ATG"
# Build a motif from the single instance "ATG"; other cells look for it in a
# sequence through motifATG.instances.search(...).
instance = [Seq("ATG")] 
motifATG = motifs.create(instance)
print(motifATG)

In [None]:
# Load the Kozak position-frequency matrix (PFM) file and parse it into a
# motif object with Bio.motifs.
with open("PWMs/Kozak_PFM.txt") as pfm_file:
    KozakMotif = motifs.read(pfm_file, 'pfm')

print(KozakMotif)

In [None]:
# Turn the Kozak count matrix into a position-weight matrix (PWM) by calling
# normalize() on its counts with pseudocounts=0.5.
# TODO: the original author left the choice of pseudocount as an open
# question ("pseudocounts = ?"); 0.5 has not been justified here.
pwmMotif = KozakMotif.counts.normalize(pseudocounts=0.5)
print(pwmMotif)

In [None]:
# Turn the PWM into a position-specific scoring matrix (PSSM) via log_odds().
# No background is passed, so log_odds() falls back to its default background
# (presumably uniform - confirm against the Biopython version in use).

pssmMotif = pwmMotif.log_odds()
print (pssmMotif)
# Bare expression: as the last line of the cell it displays pwmMotif (the PWM,
# not the PSSM that was just printed).
pwmMotif

In [None]:
# Score the test sequence against the Kozak PSSM and show the result.
# print is written as a call so the cell parses on Python 2 and Python 3 alike
# (the original used the Python 2-only `print score` statement).
score = pssmMotif.calculate(test_seq)
print(score)

In [None]:
# Tabulate, for one sequence, every ATG found together with its position,
# its reading frame and the PSSM score.
def getpos (sequenceID, sequence, motifATG, pssmMotif):
    """Return a DataFrame with one row per ATG occurrence in `sequence`.

    Parameters
    ----------
    sequenceID : value stored in the 'Sequence' column of every row.
    sequence : sequence object to scan. Its `alphabet` attribute is set to
        IUPAC.unambiguous_dna as a side effect (the caller's object is
        modified).
    motifATG : motif whose `instances.search(sequence)` yields
        (position, match) pairs.
    pssmMotif : scoring matrix whose `calculate(sequence)` result is stored
        in the 'score' column.

    Returns
    -------
    pandas.DataFrame with columns
    ['Sequence', 'ATGNumber', 'position', 'frame', 'score'].
    ATGNumber counts hits from 1, position is the value reported by the
    search, and frame is position % 3. The frame is empty (columns only)
    when the motif is not found.

    NOTE(review): `pssmMotif.calculate` is evaluated on the whole sequence,
    so every row carries the same value instead of a score specific to that
    ATG. A per-ATG score would need the offset of the ATG inside the Kozak
    motif, which is not visible from this function.
    """
    columns = ['Sequence', 'ATGNumber', 'position', 'frame', 'score']
    sequence.alphabet = IUPAC.unambiguous_dna

    hits = list(motifATG.instances.search(sequence))
    if not hits:
        # Same as before: no hit means no scoring call and an empty table.
        return pd.DataFrame(columns=columns)

    # Loop-invariant: the call does not depend on the individual hit, so it is
    # evaluated once instead of once per ATG.
    score = pssmMotif.calculate(sequence)

    # Collect plain records and build the frame once; this avoids the
    # quadratic row-by-row DataFrame.append (removed in pandas 2.0).
    records = [
        {'Sequence': sequenceID,
         'ATGNumber': number,
         'position': pos,
         'frame': pos % 3,
         'score': score}
        for number, (pos, _match) in enumerate(hits, start=1)
    ]
    return pd.DataFrame(records, columns=columns)

In [None]:
# Run getpos over every record of the H99 CDS FASTA file, stack the
# per-record tables and save the combined table to result.csv.
#
# The per-record frames are collected in a list and concatenated once, which
# avoids growing dfresult with DataFrame.append inside the loop (quadratic,
# and removed in pandas 2.0). The CSV is written once after the loop; the
# original rewrote the whole file after every record. One visible difference:
# an input file with no records now yields a header-only result.csv, where
# the original wrote no file at all.
per_record_frames = []
for seq_record in SeqIO.parse("Crypto/H99_CDS.fa", "fasta"):
    df = getpos(seq_record.id, seq_record.seq, motifATG, pssmMotif)
    per_record_frames.append(df)

if per_record_frames:
    dfresult = pd.concat(per_record_frames, ignore_index=True)
else:
    # pd.concat refuses an empty list; fall back to an empty table.
    dfresult = pd.DataFrame(columns=['Sequence', 'ATGNumber', 'position', 'frame', 'score'])

dfresult.to_csv('result.csv', encoding='utf-8', index=False)

In [None]:
# Walk over every ATG found in Seq1 and print its running number, its
# position and its reading frame (position modulo 3).
index, position, frame = 0, 0, 0
for index, (pos, seq) in enumerate(motifATG.instances.search(Seq1), start=1):
    ATGNumber, position, frame = index, pos, pos % 3
    print("%i %s" % (pos, seq))
    print("ATGNumber is %i" % ATGNumber)
    print("position is %i" % position)
    print("frame is %i" % frame)

# Exactly one ATG: report it and reset position/frame to the -1 marker.
if index == 1:
    print("no other frame")
    ATGNumber, position, frame = index, -1, -1

In [None]:
# Locate the second ATG in Seq1 and report its position and reading frame.
# The search stops as soon as the second hit is seen.
index = position = frame = 0
for index, (pos, seq) in enumerate(motifATG.instances.search(Seq1), start=1):
    print(index)
    print("%i %s" % (pos, seq))
    if index == 2:
        position, frame = pos, pos % 3
        break

# Only one ATG was found: flag position and frame with -1.
if index == 1:
    print("no frame")
    position = frame = -1

print("frame is %i" % frame)
print("position is %i" % position)

In [None]:
# Basic motif search: print the position and matched text of every ATG in Seq1.
# The original cell searched with `motif1`, a name that is never defined in
# this notebook (NameError on a fresh kernel); motifATG is the ATG motif the
# notebook actually creates, so it is used here instead.
for pos, seq in motifATG.instances.search(Seq1):
    print("%i %s" % (pos, seq))

In [None]:
# Parse a FASTA file: for each record print its id, the repr of its sequence
# and its length, then call getpos2 on the sequence (the return value is
# discarded here).
# NOTE(review): getpos2 is not defined anywhere in the visible notebook -
# presumably it lived in a deleted cell; confirm before a Restart & Run All.
for seq_record in SeqIO.parse("InputTestSeq.fasta", "fasta"):
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))
    getpos2(seq_record.seq,motifATG,verbose=False)

In [None]:
# Export one tab-separated line per FASTA record to ATGposTestSeq.txt:
# the record id followed by the first two items returned by getpos2
# (written under the Pos and Frame headers).
with open("ATGposTestSeq.txt", "w") as output_file:
    output_file.write("Seq\tPos\tFrame\n")
    for seq_record in SeqIO.parse("InputTestSeq.fasta", "fasta"):
        posframe = getpos2(seq_record.seq, motifATG, verbose=False)
        fields = (seq_record.id, str(posframe[0]), str(posframe[1]))
        output_file.write("\t".join(fields) + "\n")

In [None]:
# OutputPosFrame reads a FASTA file and writes a tab-separated text file with,
# for every record, the position and the frame reported by getpos2.
def OutputPosFrame (input_file, output_path="ATGposCandida_albicans_sc5314.Cand_albi_SC5314_V4.cds.all.txt"):
    """Write the getpos2 position/frame of every FASTA record to a TSV file.

    Parameters
    ----------
    input_file : path (or handle) of the FASTA file, passed to SeqIO.parse.
    output_path : file to write. Defaults to the file name that used to be
        hard-coded, so existing one-argument calls behave as before.

    Returns
    -------
    The file object used for writing. It is already closed when returned
    (kept only for backward compatibility with existing callers).

    Notes
    -----
    The original made an extra first pass that called getpos2 on every record
    and threw the result away; that pass is removed. This assumes getpos2 has
    no side effects - it is not defined in the visible notebook, so confirm.
    All rows are computed before the output file is opened, so a failure while
    parsing or in getpos2 still leaves no partial output file behind.
    """
    rows = []
    for seq_record in SeqIO.parse(input_file, "fasta"):
        posframe = getpos2(seq_record.seq, motifATG, verbose=False)
        rows.append(seq_record.id + "\t" + str(posframe[0]) + "\t" + str(posframe[1]) + "\n")

    with open(output_path, "w") as output_file:
        output_file.write("Seq\tPos\tFrame\n")
        output_file.writelines(rows)
    return (output_file)

In [None]:
# Run the position/frame export on the small test FASTA file.
# NOTE(review): with the OutputPosFrame defined in the preceding cell, the
# output goes to the Candida-named default file even for this test input.
OutputPosFrame ("InputTestSeq.fasta")

In [None]:
# Run the position/frame export on the Candida albicans CDS FASTA file.
# NOTE(review): this writes to the same default output file as the test call
# in the previous cell, so that earlier output is overwritten.
OutputPosFrame ("Homologus fungal/Candida_albicans_sc5314.Cand_albi_SC5314_V4.cds.all.fa")

In [None]:
# OutputPosFrame reads a FASTA file and writes a tab-separated text file with,
# for every record, the position and the frame reported by getpos2.
# NOTE: this redefinition replaces the OutputPosFrame defined earlier in the
# notebook; the visible difference is the default output file name.
def OutputPosFrame (input_file, output_path="ATGposCryptoJEC21Seq.txt"):
    """Write the getpos2 position/frame of every FASTA record to a TSV file.

    Parameters
    ----------
    input_file : path (or handle) of the FASTA file, passed to SeqIO.parse.
    output_path : file to write. Defaults to the file name that used to be
        hard-coded, so existing one-argument calls behave as before.

    Returns
    -------
    The file object used for writing. It is already closed when returned
    (kept only for backward compatibility with existing callers).

    Notes
    -----
    * The header is now "Seq<TAB>Pos<TAB>Frame". The original header had two
      tabs after "Seq" while the data rows use one, which misaligned the
      header with the three data columns.
    * The original made an extra first pass that called getpos2 on every
      record and threw the result away; that pass is removed. This assumes
      getpos2 has no side effects - it is not defined in the visible
      notebook, so confirm.
    * All rows are computed before the output file is opened, so a failure
      while parsing or in getpos2 still leaves no partial output file behind.
    """
    rows = []
    for seq_record in SeqIO.parse(input_file, "fasta"):
        posframe = getpos2(seq_record.seq, motifATG, verbose=False)
        rows.append(seq_record.id + "\t" + str(posframe[0]) + "\t" + str(posframe[1]) + "\n")

    with open(output_path, "w") as output_file:
        output_file.write("Seq\tPos\tFrame\n")
        output_file.writelines(rows)
    return (output_file)

In [None]:
# Tabulate every ATG found in Seq3. The first ATG is recorded with position
# and frame set to -1 (presumably marking the main start codon rather than an
# alternative one - confirm); later ATGs get their real position and
# frame = position % 3.
#
# Rows are collected in a list and the frame is built once, instead of
# growing it with DataFrame.append per hit (quadratic, removed in pandas 2.0).
atg_rows = []
index = 0
position = 0
frame = 0
for index, (pos, seq) in enumerate(motifATG.instances.search(Seq3), start=1):
    ATGNumber = index
    if index == 1:
        position, frame = -1, -1
    else:
        position, frame = pos, pos % 3
    atg_rows.append({'Sequence': 'Seq',
                     'ATGNumber': ATGNumber,
                     'position': position,
                     'frame': frame})

df = pd.DataFrame(atg_rows, columns=['Sequence', 'ATGNumber', 'position', 'frame'])

df

In [None]:
# Save the table currently bound to `df` as result.csv (UTF-8, no index column).
# NOTE(review): the H99 batch cell earlier in the notebook writes to the same
# file name, so whichever of the two cells runs last wins.
df.to_csv ('result.csv', encoding ='utf-8', index = False)

In [None]:
# Run getpos on the three in-memory test sequences, print each per-sequence
# table and stack them into one result table.
#
# The original called getpos(Seq, motifATG) with two arguments. When the
# notebook is run top to bottom, the getpos in scope at this point is the
# four-argument one (sequenceID, sequence, motifATG, pssmMotif), so those
# calls raised a TypeError. The calls now pass a label and the PSSM too.
# The three tables are combined with a single pd.concat instead of chained
# DataFrame.append calls (removed in pandas 2.0).
df1 = getpos("Seq1", Seq1, motifATG, pssmMotif)
print (df1)

df2 = getpos("Seq2", Seq2, motifATG, pssmMotif)
print (df2)

df3 = getpos("Seq3", Seq3, motifATG, pssmMotif)
print (df3)

dfresult = pd.concat([df1, df2, df3], ignore_index=True)

dfresult

In [None]:
# Run getpos over every record of the test FASTA file and display the
# combined table.
#
# The original called getpos(seq_record.seq, motifATG) with two arguments.
# When the notebook is run top to bottom, the getpos in scope at this point is
# the four-argument one (sequenceID, sequence, motifATG, pssmMotif), so the
# call raised a TypeError. It now passes the record id and the PSSM as well,
# matching the H99 batch cell. Per-record tables are concatenated once rather
# than appended in the loop (DataFrame.append was removed in pandas 2.0).
per_record_frames = []
for seq_record in SeqIO.parse("InputTestSeq.fasta", "fasta"):
    df = getpos(seq_record.id, seq_record.seq, motifATG, pssmMotif)
    per_record_frames.append(df)

if per_record_frames:
    dfresult = pd.concat(per_record_frames, ignore_index=True)
else:
    # pd.concat refuses an empty list; fall back to an empty table.
    dfresult = pd.DataFrame(columns=['Sequence', 'ATGNumber', 'position', 'frame', 'score'])

dfresult

In [None]:
# Tabulate, for one sequence, every occurrence of a motif together with its
# position, its reading frame and the PSSM score.
# NOTE: this two-argument definition replaces (shadows) the four-argument
# getpos defined earlier in the notebook once this cell has been run.
def getpos (sequence, motif):
    """Return a DataFrame with one row per occurrence of `motif` in `sequence`.

    Parameters
    ----------
    sequence : sequence object to scan; it is also stored as-is in the
        'Sequence' column of every row (the original author marked this
        "to check").
    motif : motif whose `instances.search(sequence)` yields (position, match)
        pairs. The original body ignored this parameter and always used the
        global motifATG; the parameter is now honoured. Existing callers pass
        motifATG, so their results are unchanged.

    The score comes from the notebook-level `pssmMotif` (a global, not a
    parameter of this function).

    Returns
    -------
    pandas.DataFrame with columns
    ['Sequence', 'ATGNumber', 'position', 'frame', 'score'].
    ATGNumber counts hits from 1, position is the value reported by the
    search, and frame is position % 3. The frame is empty (columns only)
    when the motif is not found.

    NOTE(review): `pssmMotif.calculate` is evaluated on the whole sequence,
    so every row carries the same value instead of a score specific to that
    hit.
    """
    columns = ['Sequence', 'ATGNumber', 'position', 'frame', 'score']

    hits = list(motif.instances.search(sequence))
    if not hits:
        # Same as before: no hit means no scoring call and an empty table.
        return pd.DataFrame(columns=columns)

    # Loop-invariant: evaluated once instead of once per hit.
    score = pssmMotif.calculate(sequence)

    # Collect plain records and build the frame once; this avoids the
    # quadratic row-by-row DataFrame.append (removed in pandas 2.0).
    records = [
        {'Sequence': sequence,
         'ATGNumber': number,
         'position': pos,
         'frame': pos % 3,
         'score': score}
        for number, (pos, _match) in enumerate(hits, start=1)
    ]
    return pd.DataFrame(records, columns=columns)