In [1]:
import Bio # That's biopython 
from Bio import SeqIO # To read the sequences from the fasta files 
from Bio.Seq import Seq # To work with sequences as an object 
from Bio.SeqRecord import SeqRecord
from Bio import motifs # motif library
from Bio.Alphabet import IUPAC
# Pandas' data structure libraries
import pandas as pd 
import numpy as np
import pdb
from math import isclose

In [2]:
# Toy DNA sequences used to exercise the functions defined in this notebook.
Seq1 = Seq("ATGCCTATG", IUPAC.IUPACUnambiguousDNA())
Seq2 = Seq("ATGCCCCCCC", IUPAC.IUPACUnambiguousDNA())
Seq3 = Seq("ATGCCCCCCGATG", IUPAC.IUPACUnambiguousDNA())
test_seq = Seq("TATGATGTAGTATAATATAATTATAA", IUPAC.IUPACUnambiguousDNA())
# 11-nt sequences used as best-case / worst-case inputs for the Kozak score.
test_Kozak = Seq("CAAACAAAATG", IUPAC.IUPACUnambiguousDNA())
test_antiKozak = Seq("GGGGGTTTTCT", IUPAC.IUPACUnambiguousDNA())

# Echo every sequence, one per line.
print(Seq1, Seq2, Seq3, test_seq, test_Kozak, test_antiKozak, sep="\n")

ATGCCTATG
ATGCCCCCCC
ATGCCCCCCGATG
TATGATGTAGTATAATATAATTATAA
CAAACAAAATG
GGGGGTTTTCT


In [5]:
# Build a motif from a single instance: the start codon "ATG".
# motifATG is the default `motif` argument of output_pos_frame, so this cell
# must run before that function is defined.
motifATG = motifs.create( [ Seq("ATG") ] )
print(motifATG)

ATG



In [4]:
# Read the Kozak position frequency matrix (PFM) from disk into a motif object.
# The path is relative to the notebook's working directory.
with open("PWMs/Kozak_PFM.txt") as handle: 
        KozakMotif = motifs.read(handle, 'pfm')

print (KozakMotif)

TF name	None
Matrix ID	None
Matrix:
        0      1      2      3      4      5      6      7      8      9     10
A:  60.00  90.00  70.00  80.00  80.00 160.00 110.00 110.00 200.00   0.00   0.00
C:  70.00  60.00  50.00  50.00 100.00  10.00  60.00  60.00   0.00   0.00   0.00
G:  10.00  10.00  30.00  20.00   5.00  25.00  15.00  15.00   0.00   0.00 200.00
T:  60.00  40.00  50.00  50.00  15.00   5.00  15.00  15.00   0.00 200.00   0.00





In [6]:
# Turn the Kozak counts into a position weight matrix (per-column frequencies).
# pseudocounts=0.5 is presumably added to each count before normalising so that
# letters never observed at a position do not get probability exactly 0 — the
# PSSM printed further down shows -6.66 (not -inf) for those cells.
# TODO confirm that 0.5 is the intended pseudocount.
pwmKozak = KozakMotif.counts.normalize(pseudocounts=0.5)
print(pwmKozak)

        0      1      2      3      4      5      6      7      8      9     10
A:   0.30   0.45   0.35   0.40   0.40   0.79   0.55   0.55   0.99   0.00   0.00
C:   0.35   0.30   0.25   0.25   0.50   0.05   0.30   0.30   0.00   0.00   0.00
G:   0.05   0.05   0.15   0.10   0.03   0.13   0.08   0.08   0.00   0.00   0.99
T:   0.30   0.20   0.25   0.25   0.08   0.03   0.08   0.08   0.00   0.99   0.00



In [7]:
# Turn the PWM into a position-specific scoring matrix (PSSM) of log-odds scores.
# log_odds() is called without a background argument; the printed values
# (0.25 -> 0.00, 0.50 -> ~1) look like log2(p / 0.25), i.e. a uniform background.
# NOTE(review): confirm a uniform background is appropriate for these genomes.
pssmKozak = pwmKozak.log_odds()
print (pssmKozak)

        0      1      2      3      4      5      6      7      8      9     10
A:   0.26   0.84   0.48   0.67   0.67   1.67   1.13   1.13   1.99  -6.66  -6.66
C:   0.48   0.26   0.00   0.00   0.99  -2.27   0.26   0.26  -6.66  -6.66  -6.66
G:  -2.27  -2.27  -0.73  -1.30  -3.20  -0.99  -1.70  -1.70  -6.66  -6.66   1.99
T:   0.26  -0.32   0.00   0.00  -1.70  -3.20  -1.70  -1.70  -6.66   1.99  -6.66



In [8]:
def get_pos_frame (sequenceID, sequence, motif):
    """Locate every occurrence of `motif` in `sequence` and report its frame.

    Parameters
    ----------
    sequenceID : label copied into the 'Sequence' column of every row.
    sequence : sequence object to scan. Side effect: its `alphabet` attribute
        is overwritten with IUPAC.unambiguous_dna.
    motif : object exposing `instances.search(sequence)` that yields
        (position, instance) pairs, e.g. `motifATG`.

    Returns
    -------
    pandas.DataFrame with columns 'Sequence', 'ATGNumber', 'Position', 'Frame',
    one row per hit. 'ATGNumber' counts hits from 1 in search order,
    'Position' is the position reported by the search (0 for a hit at the very
    start) and 'Frame' is Position % 3. The frame is empty (columns only)
    when there is no hit.
    """
    columns = ['Sequence', 'ATGNumber', 'Position', 'Frame']
    sequence.alphabet = IUPAC.unambiguous_dna
    # Collect plain records and build the frame once: appending to a DataFrame
    # row by row copies it on every hit, and DataFrame.append is gone in pandas 2.
    records = [
        {'Sequence': sequenceID,
         'ATGNumber': hit_number,
         'Position': pos,
         'Frame': pos % 3}
        for hit_number, (pos, _instance)
        in enumerate(motif.instances.search(sequence), start=1)
    ]
    return pd.DataFrame(records, columns=columns)

In [9]:
# Exercise get_pos_frame on each toy sequence; one table is printed per sequence.
print(get_pos_frame("Seq1", Seq1, motifATG),
      get_pos_frame("Seq2", Seq2, motifATG),
      get_pos_frame("Seq3", Seq3, motifATG),
      get_pos_frame("test_seq", test_seq, motifATG),
      sep="\n")

  Sequence ATGNumber Position Frame
0     Seq1         1        0     0
1     Seq1         2        6     0
  Sequence ATGNumber Position Frame
0     Seq2         1        0     0
  Sequence ATGNumber Position Frame
0     Seq3         1        0     0
1     Seq3         2       10     1
   Sequence ATGNumber Position Frame
0  test_seq         1        1     1
1  test_seq         2        4     1


In [10]:
# output_pos_frame scans every record of a FASTA file for `motif` and writes the
# hit positions and reading frames to a CSV file.
def output_pos_frame (input_file, output_file, motif=motifATG): 
    """Tabulate motif hits for every FASTA record and save the table as CSV.

    Parameters
    ----------
    input_file : FASTA file, parsed with SeqIO.parse(input_file, "fasta").
    output_file : path of the CSV to write (UTF-8, no index column).
    motif : motif passed to get_pos_frame; defaults to the `motifATG` object
        that existed when this function was defined.

    Returns
    -------
    pandas.DataFrame with columns 'Sequence', 'ATGNumber', 'Position', 'Frame',
    holding the rows of all records in file order. The same table is written
    to `output_file`; a header-only file is written when nothing is found.
    """
    columns = ['Sequence', 'ATGNumber', 'Position', 'Frame']
    per_record = [get_pos_frame(seq_record.id, seq_record.seq, motif)
                  for seq_record in SeqIO.parse(input_file, "fasta")]
    # Drop records without hits, then concatenate once instead of appending
    # inside the loop (each append copied the whole accumulated table).
    per_record = [frame for frame in per_record if not frame.empty]
    if per_record:
        dfresult = pd.concat(per_record, ignore_index=True)
    else:
        dfresult = pd.DataFrame(columns=columns)
    dfresult.to_csv(output_file, encoding='utf-8', index=False)
    return dfresult

In [11]:
# Min-max normalised PSSM score: 0 at the PSSM minimum, 1 at its maximum (like Bioconductor)
def calc_norm_score (pssm, seq) :
    """Rescale the raw PSSM score(s) of `seq` by the PSSM's own min and max.

    `pssm` must provide `calculate(seq)` plus `min` and `max` attributes.
    Whatever `calculate` returns is transformed as
    (raw - pssm.min) / (pssm.max - pssm.min) and returned.
    """
    raw = pssm.calculate(seq)
    lowest = pssm.min
    score_range = pssm.max - lowest
    return (raw - lowest) / score_range

# Sanity checks, printed in this order:
#   test_seq       -> an array of numbers between 0 and 1
#   test_Kozak     -> should be 1
#   test_antiKozak -> should be 0
print(calc_norm_score(pssmKozak, test_seq),
      calc_norm_score(pssmKozak, test_Kozak),
      calc_norm_score(pssmKozak, test_antiKozak),
      sep="\n")

[0.3798483  0.4352925  0.15832752 0.553093   0.29560044 0.33615193
 0.59644955 0.33142576 0.6381159  0.3224843  0.47655964 0.68698126
 0.52971977 0.35268602 0.66425353 0.30258524]
1.0000000052353402
2.3204725370384962e-08


In [12]:
# Calculate the score for each position of the test sequence and tabulate it,
# one row per scored position.
# `s` stays a notebook-level name because later cells may refer to it.
s = calc_norm_score( pssmKozak, test_seq )
# Build the table in one step; growing a DataFrame with df.append row by row
# copies it on every iteration and no longer exists in pandas 2.
df = pd.DataFrame({'Sequence': "test_seq",
                   'Position': np.arange(len(s)),
                   'Score': s},
                  columns = ['Sequence', 'Position','Score'])
print (df)

    Sequence Position       Score
0   test_seq        0   0.3798483
1   test_seq        1   0.4352925
2   test_seq        2  0.15832752
3   test_seq        3    0.553093
4   test_seq        4  0.29560044
5   test_seq        5  0.33615193
6   test_seq        6  0.59644955
7   test_seq        7  0.33142576
8   test_seq        8   0.6381159
9   test_seq        9   0.3224843
10  test_seq       10  0.47655964
11  test_seq       11  0.68698126
12  test_seq       12  0.52971977
13  test_seq       13  0.35268602
14  test_seq       14  0.66425353
15  test_seq       15  0.30258524


In [13]:
# Function to calculate the normalised score at each position of one sequence
def get_motif_score (sequenceID, sequence, motif):
    """Tabulate the normalised PSSM score of `sequence`, one row per position.

    Parameters
    ----------
    sequenceID : label copied into the 'Sequence' column.
    sequence : sequence handed to calc_norm_score.
    motif : the PSSM handed to calc_norm_score (e.g. `pssmKozak`).

    Returns
    -------
    pandas.DataFrame with columns 'Sequence', 'Position', 'Score'. 'Position'
    is the 0-based index into the array returned by calc_norm_score.

    The previous version ignored all three arguments and always scored the
    notebook-level `test_seq` against `pssmKozak`.
    NOTE(review): output_motif_score sets the sequence alphabet before scoring;
    presumably the caller has to do the same here — confirm.
    """
    # atleast_1d: calc_norm_score yields a bare scalar when there is a single score.
    scores = np.atleast_1d(calc_norm_score(motif, sequence))
    return pd.DataFrame({'Sequence': sequenceID,
                         'Position': np.arange(len(scores)),
                         'Score': scores},
                        columns=['Sequence', 'Position', 'Score'])

In [14]:
# Tabulate the normalised Kozak score of test_seq, one row per scored position.
get_motif_score ("test_seq", test_seq, pssmKozak)

Unnamed: 0,Sequence,Position,Score
0,test_seq,0,0.3798483
1,test_seq,1,0.4352925
2,test_seq,2,0.15832752
3,test_seq,3,0.553093
4,test_seq,4,0.29560044
5,test_seq,5,0.33615193
6,test_seq,6,0.59644955
7,test_seq,7,0.33142576
8,test_seq,8,0.6381159
9,test_seq,9,0.3224843


In [15]:
# Function to calculate the score for each position in each sequence of a fasta file and output the df  
def output_motif_score (motif, input_file, output_file):
    df = pd.DataFrame(columns = ['Sequence', 'Position','Score'])
    for seq_record in SeqIO.parse(input_file, "fasta"):
        seq_record.seq.alphabet = IUPAC.unambiguous_dna
        # score at all positions
        s = calc_norm_score(motif, seq_record.seq)
        position = 0
        for x in np.nditer(s):
            df = df.append({'Sequence': seq_record.id, 
                            'Position': position,
                            'Score': x}, ignore_index=True)
            position = position + 1 
        df.to_csv (output_file, encoding ='utf-8', index = False)
    return (df)

In [16]:
# AssertAlmostEqual: check whether the Kozak score of a sequence is close to a value
def AssertAlmostEqual(sequence, value, abs_tol=1e-8):
    """Return True when the normalised Kozak score of `sequence` is close to `value`.

    Despite its name this function does not assert anything: it returns the
    boolean result of math.isclose(score, value, abs_tol=abs_tol).

    Parameters
    ----------
    sequence : sequence scored with calc_norm_score against the notebook-level
        `pssmKozak`. It must produce a single score, because math.isclose
        compares scalars.
    value : expected score.
    abs_tol : absolute tolerance passed to math.isclose. The default (1e-8)
        is the tolerance that used to be hard-coded here; the anti-Kozak
        check earlier in this notebook prints about 2.3e-08 where 0 is
        expected, so a looser tolerance is needed for it to pass.
    """
    score = calc_norm_score(pssmKozak, sequence)
    return isclose(score, value, abs_tol=abs_tol)

In [14]:
# The anti-Kozak score printed earlier is about 2.3e-08, which is further from 0
# than the default absolute tolerance of 1e-8, so this check evaluates to False.
AssertAlmostEqual(test_antiKozak, 0)

False

In [17]:
def AssertAlmostEqual1(sequence, value, sequenceID="test_seq", abs_tol=1e-7):
    """Compare the Kozak score at every position of `sequence` with `value`.

    Parameters
    ----------
    sequence : sequence scored with calc_norm_score against the notebook-level
        `pssmKozak`.
    value : expected score.
    sequenceID : label written to the 'Sequence' column; defaults to the
        "test_seq" label that used to be hard-coded.
    abs_tol : absolute tolerance passed to math.isclose; defaults to the 1e-7
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame with columns 'Sequence', 'Position', 'Score',
    'AlmostEqual', one row per score; 'AlmostEqual' is the math.isclose result.

    The previous version computed the scores of `sequence` but then iterated
    over the notebook-level array `s`, so its result did not depend on
    `sequence`.
    """
    # atleast_1d: calc_norm_score yields a bare scalar when there is a single score.
    scores = np.atleast_1d(calc_norm_score( pssmKozak, sequence ))
    records = [
        {'Sequence': sequenceID,
         'Position': position,
         'Score': float(score),
         'AlmostEqual': isclose(score, value, abs_tol=abs_tol)}
        for position, score in enumerate(scores)
    ]
    return pd.DataFrame(records,
                        columns = ['Sequence', 'Position','Score', 'AlmostEqual'])

In [18]:
# None of the test_seq scores is within 1e-7 of 0.38 (the closest one is
# 0.3798483), so every row of the 'AlmostEqual' column is False.
AssertAlmostEqual1(test_seq, 0.3800000)

Unnamed: 0,Sequence,Position,Score,AlmostEqual
0,test_seq,0,0.3798483,False
1,test_seq,1,0.4352925,False
2,test_seq,2,0.15832752,False
3,test_seq,3,0.553093,False
4,test_seq,4,0.29560044,False
5,test_seq,5,0.33615193,False
6,test_seq,6,0.59644955,False
7,test_seq,7,0.33142576,False
8,test_seq,8,0.6381159,False
9,test_seq,9,0.3224843,False


In [62]:
# get_frame_score computes a score for every ATG found in a sequence
def get_frame_score (sequenceID, sequence, motifAtg, motifPssmKozak): 
    """Attach a Kozak-based score to every ATG hit of `sequence`.

    Parameters
    ----------
    sequenceID : label forwarded to get_pos_frame and get_motif_score (the
        previous version forwarded a hard-coded "test_seq" instead).
    sequence : sequence to analyse.
    motifAtg : motif forwarded to get_pos_frame (e.g. `motifATG`).
    motifPssmKozak : PSSM forwarded to get_motif_score (e.g. `pssmKozak`).

    Returns
    -------
    The get_pos_frame table ('Sequence', 'ATGNumber', 'Position', 'Frame')
    with an extra 'Score' column. For a hit at position p the score is the
    sum of the get_motif_score values at positions p, p+1 and p+2. It is NaN
    when any of those three positions has no score; the previous version
    raised KeyError in that case.

    NOTE(review): each get_motif_score value presumably belongs to the PSSM
    window starting at that position, so this adds three overlapping window
    scores. The formula is kept from the original — confirm it is intended.
    """
    # Table with the ATG positions and frames
    dfFrame = get_pos_frame(sequenceID, sequence, motifAtg)
    # Table with the score of every position
    dfScore = get_motif_score(sequenceID, sequence, motifPssmKozak)
    # Position -> score lookup, built once instead of filtering dfScore three
    # times per ATG.
    score_by_position = pd.Series(dfScore['Score'].astype(float).values,
                                  index=dfScore['Position'].astype(int).values)

    def atg_score(position):
        # Scores at position, position+1, position+2; missing ones become NaN.
        window = score_by_position.reindex(list(range(position, position + 3)))
        if window.isnull().any():
            return float('nan')
        return float(window.sum())

    atg_scores = [atg_score(int(position))
                  for position in dfFrame['Position'].tolist()]
    return dfFrame.assign(Score=atg_scores)

In [63]:
# Score the ATGs of test_seq: ATG positions and frames plus a Kozak-based score.
get_frame_score ("test_seq", test_seq, motifATG, pssmKozak)

Unnamed: 0,Sequence,ATGNumber,Position,Frame,Score
0,test_seq,1,1,1,1.146713
1,test_seq,2,4,1,1.228202


# Run the output_pos_frame with homologous species (ATGs and their positions)

In [None]:
# Smoke test: run output_pos_frame on 10 sequences from H99 before the full runs.
output_pos_frame ('Sequences/test_10_H99_CDS.fa',
                  'Output/ATG_pos_frame_species/test_10_ATG_pos_frame_H99_CDS.csv')

# CDS FASTA files (base names, without the '.fa' extension) of the species to process.
SPECIES_CDS = [
    'Ashbya_gossypii.ASM9102v1.cds.all',
    'Aspergillus_nidulans.ASM1142v1.cds.all',
    'Neosartorya_fischeri.CADRE.cds.all',
    'Neurospora_crassa.NC12.cds.all',
]

# One call per species replaces the copy-pasted cells. That removes the
# Aspergillus_nidulans run that was executed twice and the stray space in the
# Neosartorya_fischeri output name ("...cds.all .csv").
for cds_name in SPECIES_CDS:
    output_pos_frame ('Sequences/' + cds_name + '.fa',
                      'Output/ATG_pos_frame_species/ATG_pos_frame_' + cds_name + '.csv')

# Run the output_motif_score with homologous species (pssmKozak score of each nucleotide) 

In [None]:
# Per-position Kozak scores for every Ashbya gossypii CDS.
# Paths are relative to the notebook's working directory, like the 'Sequences/'
# and 'Output/' paths of the output_pos_frame runs, instead of an absolute path
# that only exists on one machine.
# NOTE(review): assumes the notebook runs from the AltTranslationInitiation
# project folder — confirm.
output_motif_score (pssmKozak,
                    'Sequences/Ashbya_gossypii.ASM9102v1.cds.all.fa',
                    'Output/motif_score_species/motif_score_Ashbya_gossypii.ASM9102v1.cds.all.csv')