In [1]:
from utils import *

In [2]:
def load_coding_table(path="../data/RNA_codon_table_1.txt"):
    """Load a codon table mapping each codon to a one-letter amino acid.

    Each non-blank line of the file holds a codon, optionally followed by
    whitespace and an amino-acid letter. A codon listed without an amino
    acid is mapped to a single space ``" "``.

    Parsing splits on any run of whitespace, so tab- or multi-space-separated
    lines are read correctly, and blank lines are ignored instead of adding
    an empty-string key. If a line has more than two fields, only the first
    two are used.

    Args:
        path: Location of the codon table file. Defaults to the table
            shipped next to this notebook.

    Returns:
        dict mapping codon string -> amino-acid letter (or ``" "``).

    Raises:
        OSError: If the file cannot be opened.
    """
    codon2aa = {}
    with open(path) as fin:
        for line in fin:
            fields = line.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            codon = fields[0]
            # A lone codon has no amino acid; keep the " " placeholder.
            aa = fields[1] if len(fields) > 1 else " "
            codon2aa[codon] = aa
    return codon2aa

# Load the codon table once; translate_iter and translate use it as the
# default value of their `codon2aa` parameter. Default arguments are bound
# when those functions are defined, so this cell must run before them.
CODON2AA = load_coding_table()

In [3]:
def translate_iter(sequence, codon2aa=CODON2AA):
    """Yield ``(codon, amino_acid)`` pairs for successive codons of a sequence.

    Every "T" in ``sequence`` is first replaced by "U". The result is then
    read in non-overlapping triplets starting at position 0; trailing bases
    that do not fill a whole triplet are ignored.

    Args:
        sequence: Nucleotide string to translate.
        codon2aa: Mapping from codon to amino-acid letter.

    Yields:
        Tuples ``(codon, aa)`` for each triplet found in ``codon2aa``. A
        triplet missing from the mapping is reported with ``print`` and
        skipped, so nothing is yielded for it.
    """
    sequence = sequence.replace("T", "U")
    # Length rounded down to a whole number of codons.
    usable = len(sequence) - len(sequence) % 3
    for i in range(0, usable, 3):
        codon = sequence[i:i + 3]
        try:
            aa = codon2aa[codon]
        except KeyError:
            print(f"skipping codon {codon} in {sequence} on position {i}")
        else:
            yield codon, aa
    
def translate(sequence, codon2aa=CODON2AA):
    """Return the amino-acid string obtained by translating ``sequence``.

    Args:
        sequence: Nucleotide string; handed to ``translate_iter`` unchanged.
        codon2aa: Mapping from codon to amino-acid letter.

    Returns:
        The amino-acid letters yielded by ``translate_iter``, concatenated
        in order.
    """
    residues = [pair[1] for pair in translate_iter(sequence, codon2aa)]
    return "".join(residues)

In [50]:
# Sanity check for translate() on a short RNA string. The name `sample_rna`
# is used instead of `input` so the Python builtin input() is not shadowed
# for the rest of the kernel session.
sample_rna = "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA"
# The trailing space in the expected peptide comes from the final codon UGA,
# which the codon table maps to " ".
assert translate(sample_rna) == "MAMAPRTEINSTRING "

In [51]:
# Read the whole dataset file into `sequence`, dropping leading and trailing
# whitespace (including the final newline).
with open("../data/dataset_96_4.txt") as fin:
    sequence = fin.read().strip()

In [52]:
# Translate the loaded sequence and print the resulting peptide string.
print(translate(sequence))

MPSTLPFRDRLTRSQRNGRGFLGAQYPRVDAPDTALQHGIRKRTPAYFSQCKKLGRIREVMTTLYGVSLLSVPNLHTHRRAVRETYGLLSLLPITGSAAPVNFSTTLWRFLMEARRVRFECVSDSGSGIRRICSKRLGLQSIDVAALQRIVYGAKGICLHQLYGMCKIVIRGHAVYGGCRVMLHKYEIEEIDQWFLAFSRDSDGGALTYPTLISFVLRSVTTIAIRPLVLPCLILFILNKESGAIAEVLHGVSILACDTVGLWHTSQSAKYSGLTPTLSDDGAGVSLSSCGQGMTVLCGKKAGTTVQAHSDAPELNHRSGSTAHRMSVVQSLREFSLIWGGDLPGPSCGNSAAIGFQPIVKQTGMVGLGSRTWTCHETGSCVETKVQVLQPLYFEVRCGYNTSTWSQRPPVSMYLKHVDRLRCVSKYSECAMPSDSARRTARRDTDIAFIVSTYLRCRMFSWPPEAGRLTASQGHADRPQVTTEVFSFLDASLVYEDWKKLYRPILVVTIGLPNAPEEIGHMLFTTCILDNPRLRPPRLAEVTAVTSPIATCSTTGCGRTADSFSGFPDCEVQSRHFPNLDILIAIFPIWLCSAVTGYLPSPNRLASSTTLLIFFNALRVSPLALMYGHFVRGNCWKKTLTLDHHKPIAELPFGRDMLSNISHWFVYEADHVQGIYRPLRSHEDCGDSLKSGERYCSLATLRKRPSLLPSDLKNPQADLAGSIALICYVSQLGLALRAQANNHEARHLVETSYDLELDTKIALRHAKSSARWKKRVRLRQTPWASFSTECNWALISGTTWLAYKSPMCIMGLKRTKFLGVDFERVHTRVRCNSYRSTCPQTANLLESQVWTSSIYIISYFAAESRLTCLRDLTHRLRYTFVVPRICCLLSIKIRREPVTEPPTSEASSCTNSPQRGSLEVQINGHASRIPIRRSSSPYKRVTAEGLWHSRSSVHGLGPRPASKLANQHSLRNGGRLSSWSGSWRESSSRVTATVAEAQEI

In [83]:
def find_encoding_aa(sequence, tsequence, peptide):
    """Return the nucleotide substrings of ``sequence`` that encode ``peptide``.

    ``tsequence`` is the translation of ``sequence`` read from position 0, so
    amino acid ``k`` of ``tsequence`` corresponds to nucleotides
    ``[3*k, 3*k + 3)`` of ``sequence``.

    Args:
        sequence: Nucleotide string; every "U" is replaced by "T" before
            slicing, so results are returned in the DNA alphabet.
        tsequence: Amino-acid string aligned codon-by-codon with ``sequence``.
        peptide: Amino-acid string to search for.

    Returns:
        List with one nucleotide substring per occurrence of ``peptide`` in
        ``tsequence`` (overlapping occurrences included), in left-to-right
        order.
    """
    dna = sequence.replace("U", "T")
    width = len(peptide)
    # Number of candidate start positions; non-positive means no window fits.
    n_starts = len(tsequence) - width + 1
    return [
        dna[3 * start:3 * (start + width)]
        for start in range(n_starts)
        if tsequence[start:start + width] == peptide
    ]

def peptide_encoding(sequence, peptide):
    """Find substrings of ``sequence`` that encode ``peptide`` on either strand.

    For each of the three offsets 0, 1 and 2, the suffix of ``sequence`` and
    the suffix of its reverse complement are translated and searched for
    ``peptide``. Hits found on the reverse complement are reverse-complemented
    again before being returned.

    Args:
        sequence: Nucleotide string to search.
        peptide: Amino-acid string to look for.

    Returns:
        List of nucleotide substrings. Results are grouped by offset; within
        each offset the hits from ``sequence`` come before the hits from its
        reverse complement.
    """
    rsequence = reverse_complement(sequence)
    hits = []
    for frame in range(3):
        # Forward strand, read from this offset.
        forward = sequence[frame:]
        forward_protein = translate(forward)
        hits += find_encoding_aa(forward, forward_protein, peptide)

        # Reverse-complement strand, read from the same offset.
        backward = rsequence[frame:]
        backward_protein = translate(backward)
        backward_hits = find_encoding_aa(backward, backward_protein, peptide)
        hits += [reverse_complement(hit) for hit in backward_hits]

    return hits

In [84]:
# Sample DNA string and target peptide used to check peptide_encoding.
# NOTE(review): `input` shadows the Python builtin input() for the rest of the
# kernel session; consider renaming it here and in the cell that reads it.
input = "ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA"
peptide = "MA"

In [85]:
# Print every substring found by peptide_encoding, separated by spaces.
print(" ".join(peptide_encoding(input, peptide)))

ATGGCC ATGGCC GGCCAT


In [88]:
# Read the sequence and the peptide: the first two whitespace-separated
# tokens of the dataset file.
with open("../data/dataset_96_7.txt") as fin:
    # split() with no argument tolerates a missing or extra trailing newline
    # and strips surrounding whitespace from each token. It also avoids
    # binding `_`, which IPython uses for the most recent cell output.
    # A file with fewer than two tokens still raises ValueError.
    sequence, peptide = fin.read().split()[:2]

In [89]:
# Print every substring of the dataset sequence found by peptide_encoding,
# separated by spaces.
print(" ".join(peptide_encoding(sequence, peptide)))

TGTTTTCCGGGCATGTACGTTGCGGGAATG TGTTTTCCCGGCATGTACGTGGCTGGAATG CATACCGGCAACATACATTCCAGGAAAGCA CATGCCCGCTACGTACATGCCCGGAAAGCA CATCCCAGCTACATACATCCCTGGGAAGCA CATCCCGGCTACGTACATTCCTGGGAAGCA CATACCTGCGACATACATGCCAGGGAAACA CATGCCAGCTACGTACATACCAGGAAAACA CATCCCTGCCACATACATTCCTGGGAAGCA CATACCTGCTACGTACATACCCGGAAAGCA CATCCCTGCAACATACATACCCGGAAAGCA TGTTTTCCGGGCATGTACGTTGCTGGCATG CATCCCGGCAACATACATGCCAGGGAAACA CATGCCTGCCACGTACATACCAGGAAAACA CATGCCCGCCACATACATTCCTGGAAAGCA TGCTTCCCGGGTATGTATGTTGCCGGAATG TGCTTCCCAGGGATGTACGTTGCCGGCATG TGTTTCCCGGGAATGTACGTAGCCGGAATG CATACCGGCTACATACATACCGGGGAAACA


In [4]:
### quiz

In [5]:
# Quiz: translate a candidate RNA string and display the resulting peptide.
translate("CCUCGUACAGAAAUCAAC")

'PRTEIN'

In [6]:
# Quiz: translate a candidate RNA string and display the resulting peptide.
translate("CCGAGGACCGAAAUCAAC")

'PRTEIN'

In [7]:
# Quiz: translate a candidate RNA string and display the resulting peptide.
translate("CCAAGAACAGAUAUCAAU")

'PRTDIN'

In [8]:
# Quiz: translate a candidate RNA string and display the resulting peptide.
translate("CCAAGUACAGAGAUUAAC")

'PSTEIN'

In [11]:
from collections import defaultdict

# Invert the codon table: for every amino-acid letter, collect the set of
# codons that map to it. The bare `d` on the last line displays the result.
d = defaultdict(set)
for codon in CODON2AA:
    aa = CODON2AA[codon]
    d[aa].add(codon)
d

defaultdict(set,
            {'K': {'AAA', 'AAG'},
             'N': {'AAC', 'AAU'},
             'T': {'ACA', 'ACC', 'ACG', 'ACU'},
             'R': {'AGA', 'AGG', 'CGA', 'CGC', 'CGG', 'CGU'},
             'S': {'AGC', 'AGU', 'UCA', 'UCC', 'UCG', 'UCU'},
             'I': {'AUA', 'AUC', 'AUU'},
             'M': {'AUG'},
             'Q': {'CAA', 'CAG'},
             'H': {'CAC', 'CAU'},
             'P': {'CCA', 'CCC', 'CCG', 'CCU'},
             'L': {'CUA', 'CUC', 'CUG', 'CUU', 'UUA', 'UUG'},
             'E': {'GAA', 'GAG'},
             'D': {'GAC', 'GAU'},
             'A': {'GCA', 'GCC', 'GCG', 'GCU'},
             'G': {'GGA', 'GGC', 'GGG', 'GGU'},
             'V': {'GUA', 'GUC', 'GUG', 'GUU'},
             ' ': {'UAA', 'UAG', 'UGA'},
             'Y': {'UAC', 'UAU'},
             'C': {'UGC', 'UGU'},
             'W': {'UGG'},
             'F': {'UUC', 'UUU'}})

In [12]:
# Product of six small integers, evaluated as a quick quiz calculation.
# NOTE(review): the factors look like codons-per-amino-acid counts taken from
# the inverse codon table (sets of size 6, 4 and 2 appear there) — confirm
# which peptide this is counting encodings for.
6*2*4*2*2*6

1152