# Open Reading Frames
https://rosalind.info/problems/orf/b

## My solution

### Practice 

In [1]:
from util import read_file_into_lines
from util import text_to_fasta
from util import codon_dict

In [2]:
s = "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"

In [3]:
counter = 0
peptide = ""
start_codon = "ATG"
stop_codons = ["TAG", "TGA", "TAA"]

In [4]:
# this did not work but it was a preliminary thinking
for i in range(len(s)):
    if s[i:i+3] == start_codon:
        print("start codon", s[i:i+3])       
    elif s[i:i+3] in stop_codons:
        print("end codon", s[i:i+3])
        peptide += s[i:i+3]
        break
    else: 
        print(s[i:i+3])
        peptide += s[i:i+3]

AGC
GCC
CCA
CAT
start codon ATG
TGT
GTA
end codon TAG


In [5]:
def find_candidate_string(dna_string):
    current_list = []
    for i in range(len(dna_string)):
        if dna_string[i:i+3] == "ATG":
            candidate_string = ""
            for j in range(i, len(dna_string), 3):
                codon = dna_string[j:j+3]
                if codon in stop_codons:
                    current_list.append(candidate_string)
                    break
                else: 
                    candidate_string += dna_string[j:j+3]
    return(current_list)

In [6]:
candidate_list = find_candidate_string(s)

In [7]:
candidate_list

['ATG',
 'ATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAA',
 'ATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAA']

In [8]:
sc = s[::-1].replace("A","t").replace("T", "a").replace("C", "g").replace("G", "c").upper()

candidate_list.extend(find_candidate_string(sc))
print(candidate_list)

['ATG', 'ATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAA', 'ATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAA', 'ATGCTACTCGGATCATTCAGGCTTATTCCAAAAGAGACTCTAATCCAAGTCGCGGGGTCATCCCCATGTAACCTGAGT', 'ATG']


In [9]:
def translate_dna_rna(candidate_list):
    rna_list = []
    for i in candidate_list:
        rna = ""
        for nucl in i:
            if nucl == "T":
                rna = rna + "U"
            else:
                rna = rna + nucl
        rna_list.append(rna)
    return(rna_list)

In [10]:
rna_list = translate_dna_rna(candidate_list)

In [11]:
rna_list

['AUG',
 'AUGGGGAUGACCCCGCGACUUGGAUUAGAGUCUCUUUUGGAA',
 'AUGACCCCGCGACUUGGAUUAGAGUCUCUUUUGGAA',
 'AUGCUACUCGGAUCAUUCAGGCUUAUUCCAAAAGAGACUCUAAUCCAAGUCGCGGGGUCAUCCCCAUGUAACCUGAGU',
 'AUG']

In [12]:
def from_rna_to_protein(rna_list):
    protein_list = []
    for i in rna_list:
        counter = 0 
        aa = ""
        for codon in i:
            triplet = i[counter:counter + 3] 
            if triplet == '': 
                break
            else:
                aa = aa + codon_dict[triplet] 
                counter = counter + 3
        protein_list.append(aa)
    return(protein_list)

In [13]:
from_rna_to_protein(rna_list)

['M', 'MGMTPRLGLESLLE', 'MTPRLGLESLLE', 'MLLGSFRLIPKETLIQVAGSSPCNLS', 'M']

### All together 


In [14]:
fasta_file = read_file_into_lines("rosalind_orf.txt")

In [15]:
#now we can put all the functions together into one program so that we can input a rasta file and have a resulting list of protein chains

def fasta_to_protein(fasta_file):
    dna_list = text_to_fasta(fasta_file)
    for dna_string in dna_list.values(): 
        candidate_list = find_candidate_string(dna_string)
        sc = dna_string[::-1].replace("A","t").replace("T", "a").replace("C", "g").replace("G", "c").upper()
        candidate_list.extend(find_candidate_string(sc))
    rna_list = translate_dna_rna(candidate_list)
    protein_list = from_rna_to_protein(rna_list)
    return protein_list

In [16]:
final_solution = fasta_to_protein(fasta_file)
for protein in final_solution:
    print(protein)

MPSLKSYGEQK
MRSH
MLAAERARRSSDTSTANCVLPSC
MSLLHAVNRNLTSPAPLILDPDHELSAGPLHIGKITAEVSVQPRVG
M
MIILGGGGIQKGQNFVYRPIYHHCD
MHYPPLLAQEKFPTSSGSRLAFP
MSLLITITFCVTPLGSEPMVVVRPVTHVVAR
MVVVRPVTHVVAR
MLGGSH
MHPLPIAKVQDGLGFVLV
MSPSMNRTTMRLITMVIDRTINKVLTFLNSSAT
MNRTTMRLITMVIDRTINKVLTFLNSSAT
MRLITMVIDRTINKVLTFLNSSAT
MVIDRTINKVLTFLNSSAT
MSYVIANVRDATMARCS
MRQWRDVAKQRLFERLAKRSKSRCLATFFLANSRLNAYLGCNLTNVKRSSRQFVVGV
MARCS
M
M
MCPATRLPV
ML
MGTHPR


### How do I avoid having double entries for proteins that already exist? 

## In class solution 

**Given**: A DNA sequence in FASTA format

**Asked:** all possible peptides that can be translated from it

### The Plan

- [x] transcribe the DNA to RNA (RNA exercise)
- [x] find all reading frames $\rightarrow$ three forward, three reverse $\rightarrow$ REVC exercise
- [x] find valid ORFs in an RNA sequence <font color="red">!!!!</font>
- [x] translate the mRNAs to peptides (PROT exercise)
- [x] make sure there are no duplicate protein sequences!

Ideas:

- much like in PROT, we might need to work with triplets
- whatever we do for a single reading frame, we will have to repeat 5x more $\rightarrow$ functions?
    - function that takes RNA sequence and produces all valid peptides

### Set up environment

- [x] load helper functions
- [x] get testing data

In [17]:
from util import read_file_into_lines, read_fasta, reverse_complement
from util import codon_dict

In [18]:
fasta_file = read_fasta('orf_trial.txt')
fasta = list(fasta_file.values())[0]

In [19]:
fasta

'AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG'

### 1. transcribe to RNA

In [20]:
rna = fasta.replace('T', 'U')

### 2. find all reading frames

In [21]:
def get_reading_frames(rna):
    # three forward, three backward reading frames
    rf1 = rna
    rf2 = rna[1:]
    rf3 = rna[2:]

    revc = reverse_complement(rna)
    rf4 = revc
    rf5 = revc[1:]
    rf6 = revc[2:]
    return [rf1, rf2, rf3, rf4, rf5, rf6]

In [22]:
frames = get_reading_frames(rna)
frames

['AGCCAUGUAGCUAACUCAGGUUACAUGGGGAUGACCCCGCGACUUGGAUUAGAGUCUCUUUUGGAAUAAGCCUGAAUGAUCCGAGUAGCAUCUCAG',
 'GCCAUGUAGCUAACUCAGGUUACAUGGGGAUGACCCCGCGACUUGGAUUAGAGUCUCUUUUGGAAUAAGCCUGAAUGAUCCGAGUAGCAUCUCAG',
 'CCAUGUAGCUAACUCAGGUUACAUGGGGAUGACCCCGCGACUUGGAUUAGAGUCUCUUUUGGAAUAAGCCUGAAUGAUCCGAGUAGCAUCUCAG',
 'CUGAGAUGCUACUCGGAUCAUUCAGGCUUAUUCCAAAAGAGACUCUAAUCCAAGUCGCGGGGUCAUCCCCAUGUAACCUGAGUUAGCUACAUGGCU',
 'UGAGAUGCUACUCGGAUCAUUCAGGCUUAUUCCAAAAGAGACUCUAAUCCAAGUCGCGGGGUCAUCCCCAUGUAACCUGAGUUAGCUACAUGGCU',
 'GAGAUGCUACUCGGAUCAUUCAGGCUUAUUCCAAAAGAGACUCUAAUCCAAGUCGCGGGGUCAUCCCCAUGUAACCUGAGUUAGCUACAUGGCU']

### 3. finding ORFs

In [23]:
# just to practice, use RF1
rf = frames[0]

I notice that I cannot solve this in one go; the problem are overlapping peptides. What is the
problem  about them? They share a stop codon, but have different start codons $\rightarrow$ there
is no solution with just one for loop.

My idea:

- [x] find all start codons
- [x] given a start codon, find the correspodnign ORF

In [24]:
def find_AUG(rf):
    starting_positions = []
    for i in range(0, len(rf), 3):
        codon = rf[i:i+3]
        if codon == 'AUG':
            starting_positions.append(i)
    return starting_positions

In [25]:
starts = find_AUG(rf)
starts

[24, 30, 75]

In [74]:
def find_ORF_and_translate(rf, start, codons):
    peptide = ''
    for i in range(start, len(rf), 3):
        codon = rf[i:i+3]
        if len(codon) < 3:
            continue
        if codons[codon] is None:
            return peptide
        peptide += codons[codon]
    return None

negative test: this sequence contains a start codon but no end codon; our function should return
None

In [75]:
neg = 'AUGGGGAUGACCCCGCGACUUGGAUUAGAGUCUCUUUUGGAA'
peptide = find_ORF_and_translate(neg, 0, codon_dict)
print(peptide)

None


In [76]:
def find_all_proteins(rf, codons):
    starts = find_AUG(rf)
    proteins = []
    for start in starts:
        peptide = find_ORF_and_translate(rf, start, codons)
        if peptide is not None:
            proteins.append(peptide)
    return proteins

In [77]:
find_all_proteins(frames[0], codon_dict)

[]

### Putting it all together:

In [78]:
def all_possible_proteins(fasta_path, codons):
    # 1. read the file
    fasta_file = read_fasta(fasta_path)
    fasta = list(fasta_file.values())[0]
    # 2. transcribe to RNA
    rna = fasta.replace('T', 'U')
    # 3. get reading frames
    frames = get_reading_frames(rna)
    proteins = []
    for frame in frames:
        to_add = find_all_proteins(frame, codons)
        # make sure the new proteins are not
        # already in our list
        for prot in to_add:
            if prot not in proteins:
                proteins.extend([prot])

    print('\n'.join(proteins))

### Solve example

In [79]:
all_possible_proteins('orf_trial.txt', codon_dict)




### Solve exercise

In [80]:
all_possible_proteins('rosalind_orf.txt', codon_dict)




For some reason, this is not working for me. Might be useful to try with Niko's notebook and util.py file.