# Open Reading Frames
### Problem
Either strand of a DNA double helix can serve as the coding strand for RNA transcription. Hence, a given DNA string implies six total reading frames, or ways in which the same region of DNA can be translated into amino acids: three reading frames result from reading the string itself, whereas three more result from reading its reverse complement.

An open reading frame (ORF) is a reading frame that starts at a start codon and ends at a stop codon, with no other stop codons in between. Thus, a candidate protein string is derived by translating an open reading frame into amino acids until a stop codon is reached.

### Given: 
A DNA string s of length at most 1 kbp in FASTA format.

### Return: 
Every distinct candidate protein string that can be translated from ORFs of s. Strings can be returned in any order.



In [225]:
import numpy as np

def read_FASTA(file):
    """Return the sequence of every record in a FASTA source, in order.

    Parameters
    ----------
    file : str, path-like, or iterable of lines
        A path to a FASTA file, or an already-open text file / list of
        lines.

    Returns
    -------
    list of str
        One string per ``>`` record, with the record's sequence lines joined
        together. A header with no sequence lines yields ``""``. Anything
        before the first header is ignored. Record names are not returned.

    Notes
    -----
    The file is parsed line by line instead of with ``np.genfromtxt``:
    that function splits rows on whitespace, so a header such as
    ``>id some description`` produced rows with differing column counts and
    raised an error.
    """
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        with open(file) as handle:
            lines = handle.read().splitlines()
    else:
        lines = list(file)

    strings = []
    chunks = None  # stays None until the first header line is seen
    for raw in lines:
        if isinstance(raw, bytes):
            raw = raw.decode()
        line = raw.strip()
        if not line:
            continue  # blank lines carry no information
        if line.startswith(">"):
            # A new header closes the record collected so far.
            if chunks is not None:
                strings.append("".join(chunks))
            chunks = []
        elif chunks is not None:
            # Remove any stray internal whitespace from the sequence line.
            chunks.append("".join(line.split()))

    if chunks is not None:
        strings.append("".join(chunks))

    return strings

In [338]:
# Load the first record of the dataset; the commented line below is an
# inline sample string that can be swapped in instead of the file.
s = read_FASTA("rosalind_orf.txt")[0]
#s = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"

# Transcribe the DNA string to RNA: every T becomes U, everything else is kept.
r = s.replace("T", "U")

# Reverse complement of the RNA string: reverse it, then swap each base for
# its partner. Characters other than A/U/G/C are skipped rather than copied.
r_rev = r[::-1]
pairing = {"A": "U", "U": "A", "G": "C", "C": "G"}
rc = "".join(pairing[base] for base in r_rev if base in pairing)

In [339]:
# RNA codon -> one-letter amino acid code. The three terminating codons map
# to the literal string 'Stop', which the translation code below compares
# against to close an open reading frame.
# Layout: one section per first base, one row per second base.
codon_table = {
    # first base U
    'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L',
    'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S',
    'UAU': 'Y', 'UAC': 'Y', 'UAA': 'Stop', 'UAG': 'Stop',
    'UGU': 'C', 'UGC': 'C', 'UGA': 'Stop', 'UGG': 'W',
    # first base C
    'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
    'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'CAU': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    # first base A
    'AUU': 'I', 'AUC': 'I', 'AUA': 'I', 'AUG': 'M',
    'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
    'AAU': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGU': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
    # first base G
    'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
    'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'GAU': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
}

In [340]:
# Translate the three forward reading frames of `r` (offsets 0, 1 and 2).
#
# The result is one string of space-separated tokens. An ORF is opened at
# the first AUG seen while not already inside one and is written as
# " Start" followed by its residues; a stop codon writes "Stop" (only when
# inside an ORF) followed by a space. Every frame starts outside an ORF, so
# an ORF left open at the end of one frame never continues into the next.
forward_pieces = []
for offset in range(3):
    in_orf = False
    # Stop two short of the end so every slice is a complete codon.
    for start in range(offset, len(r) - 2, 3):
        codon = r[start:start + 3]
        residue = codon_table[codon]
        if codon == 'AUG' and not in_orf:
            in_orf = True
            forward_pieces.append(" Start")
        if in_orf:
            forward_pieces.append(residue)
        if residue == "Stop":
            in_orf = False
            forward_pieces.append(" ")
protein = "".join(forward_pieces)
        

In [341]:
# Translate the three reading frames of the reverse complement `rc` and
# append the tokens to `protein`, using the same token format as the
# forward frames: " Start" + residues, then "Stop" + " " at a stop codon.
# Each frame starts outside an ORF.
reverse_pieces = []
for offset in (0, 1, 2):
    recording = False
    # Stop two short of the end so every slice is a complete codon.
    for pos in range(offset, len(rc) - 2, 3):
        triplet = rc[pos:pos + 3]
        amino_acid = codon_table[triplet]
        if triplet == 'AUG' and not recording:
            recording = True
            reverse_pieces.append(" Start")
        if recording:
            reverse_pieces.append(amino_acid)
        if amino_acid == "Stop":
            recording = False
            reverse_pieces.append(" ")
protein += "".join(reverse_pieces)

In [342]:
# Break the accumulated string into tokens on single spaces. Each ORF is one
# token beginning with "Start"; consecutive spaces produce empty strings,
# which the Start/Stop check in the next cell discards.
protein = protein.split(' ')

In [343]:
# Collect every distinct candidate protein from the translated tokens.
#
# Only tokens wrapped as "Start...Stop" are complete ORFs; tokens without
# the trailing "Stop" ran off the end of the sequence and are dropped.
# Every internal M inside an ORF is also a valid start codon in the same
# frame, so each suffix beginning at an M is a candidate too.
#
# All suffixes are enumerated directly from the full ORF instead of
# appending to `final` while looping over it: that re-expanded suffixes of
# suffixes and made the list grow exponentially with the number of internal
# M residues. `final` now holds each candidate once, in first-seen order.
final = []
seen = set()
for token in protein:
    if not (token[:5] == "Start" and token[-4:] == "Stop"):
        continue
    peptide = token[5:-4]
    # Index 0 is the ORF itself; the rest are internal methionine starts.
    start_positions = [0] + [k for k in range(1, len(peptide)) if peptide[k] == "M"]
    for k in start_positions:
        candidate = peptide[k:]
        if candidate not in seen:
            seen.add(candidate)
            final.append(candidate)

In [344]:
# Print each distinct candidate protein on its own line; the set removes
# duplicates and its iteration order is arbitrary.
unique_proteins = set(final)
for candidate in unique_proteins:
    print(candidate)

MTAIGNPMVLTT
MRNVTVE
MANTVTNMRCL
MHEMPNSDSCQGGEHPVWKISPGKRGSAAISRTLSNFILVASWTWCGVSTRVKKL
MQRFLQDSLAIWLHAECDS
MRRPE
MVLTT
MHQPQRWY
MLVTVFAMLREFLRVNAITGMVSLRGA
MQPNSLRQPHRGHSRRLHE
M
MQFGKLGTASWDL
MSVSLSDVVVPGTHFERGLESKFETASEGMWLVPSAVP
MRCL
MHFNAIWQAWNSFVGFIGNVMKDPIRRVRSGERTRRSKFFDPRRDTAPRP
MKPNR
MDVVRCLDEGQKTLIAWYARHF
MPNSDSCQGGEHPVWKISPGKRGSAAISRTLSNFILVASWTWCGVSTRVKKL
MSSSLERISSAAWKVSLRRLARECGWCPRQCHSGKRLD
MWLVPSAVP
MLREFLRVNAITGMVSLRGA
MPLSKRPYP
MKDPIRRVRSGERTRRSKFFDPRRDTAPRP
MALPRAPTTFPR
MFAALARIGIRHLMHQPQRWY
MGHEATCPSLISSIQIKSRGLSASRIIAL
MMQPNSLRQPHRGHSRRLHE
MSYDLVKGIDPMMQPNSLRQPHRGHSRRLHE
MLSSINYSTVTFRMKPNR
MRSRDDDIREGN
MHCQRR
MKLLSVLLIAADPLLPGEIFQTGCSPPWHESELGISCINLKDGTNYLIAIR
MVSLRGA
MRCLIPIRARAANILFGRFHQAKGGQRLSVGH
