In [1]:
import sys
sys.path.append("..")
from rosalind_tools.config import *
from Bio.Data import CodonTable
from rosalind_tools.utils import parse_fasta, translation, rc_DNA
from typing import List
import re

Given: A DNA string s of length at most 1 kbp in FASTA format.

Return: Every distinct candidate protein string that can be translated from ORFs of s. Strings can be returned in any order.  

In [2]:
def _orf_proteins(seq: str) -> List[str]:
    """Collect the distinct proteins translated from every ATG on both strands.

    Args:
        seq: DNA string to scan.

    Returns:
        The distinct protein strings, in the order they are first found
        (all start positions of ``seq`` first, then those of ``rc_DNA(seq)``).
    """
    # Zero-width lookahead: reports the index of every ATG without consuming it.
    start_codon = re.compile(r'(?=ATG)')
    # Dict keys serve as an insertion-ordered set: duplicates collapse, and the
    # order stays the same from run to run (unlike iterating a set of str,
    # whose order depends on the per-process hash seed).
    found = {}
    # rc_DNA is a project helper; by its name it yields the reverse-complement
    # strand, so both reading directions are scanned.
    for strand in (seq, rc_DNA(seq)):
        for match in start_codon.finditer(strand):
            protein = translation(strand[match.start():])
            # translation is expected to return None when the reading frame
            # runs off the end of the strand without reaching a stop codon;
            # such a frame is not a complete ORF, so it is skipped.
            if protein is not None:
                found[protein] = None
    return list(found)


def orf(seq: str) -> None:
    """Print every distinct candidate protein translated from the ORFs of seq.

    Each ATG in ``seq`` and in ``rc_DNA(seq)`` is treated as a potential start
    codon; the strand is translated from there, and frames for which
    ``translation`` returns None are ignored.

    Args:
        seq: DNA string to scan.

    Returns:
        None. The distinct proteins are written to stdout, one per line, in
        order of first discovery.
    """
    for protein in _orf_proteins(seq):
        print(protein)
            
    

In [3]:
# Try the sample dataset: orf prints each distinct candidate protein it finds in seq, one per line.
seq = 'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'
orf(seq)

MLLGSFRLIPKETLIQVAGSSPCNLS
M
MTPRLGLESLLE
MGMTPRLGLESLLE


In [4]:
# Run orf on the downloaded Rosalind dataset, one FASTA record at a time.
dataset_path = data_dir / 'rosalind_orf.txt'
with open(dataset_path, 'r') as fasta_handle:
    # Records are consumed while the file is still open, in case parse_fasta reads lazily.
    for fasta_record in parse_fasta(fasta_handle):
        orf(fasta_record.seq)

MDGRAGLGCALGHPSRGGRTGANGGNYLYT
MVAMRLPPDPGEFTLTITQPDLYSPATLNAVLPTKQGLHGKQGTTSCVFRKTVL
MS
M
MPSFQQSKVYMGNKELPVASLEKLFCERS
MGWPRLGRNARRK
MVP
MFLKYINSFLR
MGNKELPVASLEKLFCERS
MRWDCE
MTSGQSSVAGLIPTEHKPNGLATAWKECPTEVTLDHK
MYDRR
MTCGTYSFSFT
MLWYSYSCEDGWTCWARLCLGTPVKGRTHGS
MDVLGSVVPWDTRQGADARELTEETIYILEEHL
MLCWD
MI
MPDGSNVRS
MIAANTRYTAD
MRLPPDPGEFTLTITQPDLYSPATLNAVLPTKQGLHGKQGTTSCVFRKTVL
MA
MLTRINRVREEEVCLPATMTCGTYSFSFT
MAPDGLTSITHNPTALAAPTTLAGSPLIVDLRASDCGSLMI
MSPCYDDLWHLLLLVYLRYHPTLQGE
MKDESLFICTEWLLCVFLQIQVNLR
MSRYLFAQNGCYASSSRSR
