# RNA Splicing
### Problem
After identifying the exons and introns of an RNA string, we only need to delete the introns and concatenate the exons to form a new string ready for translation.

### Given: 
A DNA string s (of length at most 1 kbp) and a collection of substrings of s acting as introns. All strings are given in FASTA format.

### Return: 
A protein string resulting from transcribing and translating the exons of s. (Note: Only one solution will exist for the dataset provided.)



In [1]:
import numpy as np

def read_FASTA(file):
    """Read a FASTA file and return its sequences in file order.

    Parameters
    ----------
    file : str, path-like, or file object
        Path of the FASTA file, or an already-open handle exposing ``read()``.

    Returns
    -------
    list of str
        One string per record. The sequence lines of a record are
        concatenated; header lines (those starting with ">") are not
        returned. A header with no sequence lines yields an empty string.
    """
    if hasattr(file, "read"):
        text = file.read()
    else:
        with open(file) as handle:
            text = handle.read()
    if isinstance(text, bytes):
        # Handles opened in binary mode hand back bytes.
        text = text.decode()

    records = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(">"):
            # A header opens a new record. Its label may contain spaces,
            # which is why the file is parsed line by line rather than as
            # whitespace-delimited columns.
            records.append([])
        elif records:
            # Lines that appear before the first header belong to no record.
            records[-1].append(line)
    return ["".join(chunks) for chunks in records]

In [3]:
# RNA codon -> amino-acid letter. The three terminating codons map to the
# marker string 'Stop'. Each row below fixes the 2nd and 3rd base and varies
# the 1st base over U, C, A, G.
codon_table = {
    "UUU": "F", "CUU": "L", "AUU": "I", "GUU": "V",
    "UUC": "F", "CUC": "L", "AUC": "I", "GUC": "V",
    "UUA": "L", "CUA": "L", "AUA": "I", "GUA": "V",
    "UUG": "L", "CUG": "L", "AUG": "M", "GUG": "V",
    "UCU": "S", "CCU": "P", "ACU": "T", "GCU": "A",
    "UCC": "S", "CCC": "P", "ACC": "T", "GCC": "A",
    "UCA": "S", "CCA": "P", "ACA": "T", "GCA": "A",
    "UCG": "S", "CCG": "P", "ACG": "T", "GCG": "A",
    "UAU": "Y", "CAU": "H", "AAU": "N", "GAU": "D",
    "UAC": "Y", "CAC": "H", "AAC": "N", "GAC": "D",
    "UAA": "Stop", "CAA": "Q", "AAA": "K", "GAA": "E",
    "UAG": "Stop", "CAG": "Q", "AAG": "K", "GAG": "E",
    "UGU": "C", "CGU": "R", "AGU": "S", "GGU": "G",
    "UGC": "C", "CGC": "R", "AGC": "S", "GGC": "G",
    "UGA": "Stop", "CGA": "R", "AGA": "R", "GGA": "G",
    "UGG": "W", "CGG": "R", "AGG": "R", "GGG": "G",
}

In [14]:
def to_rna(s):
    """Return ``s`` with every "T" swapped for "U"; other symbols pass through."""
    return "".join("U" if base == "T" else base for base in s)

def to_protein(r):
    """Translate the RNA string ``r`` into a protein string.

    Codons are read in steps of three from the start of ``r`` and looked up
    in ``codon_table``. Translation ends at the first codon mapped to
    'Stop'; a trailing partial codon (one or two leftover symbols) is
    ignored. A codon missing from ``codon_table`` raises ``KeyError``.
    """
    # Only whole codons are translated; drop the 0-2 leftover symbols.
    usable_length = len(r) - len(r) % 3
    residues = []
    for start in range(0, usable_length, 3):
        amino = codon_table[r[start:start + 3]]
        if amino == 'Stop':
            break
        residues.append(amino)
    return "".join(residues)

In [37]:
# Load every record of the dataset, then transcribe each one to RNA.
strings = read_FASTA("rosalind_splc.txt")
# Sample dataset, handy for a quick check without the input file:
#strings = ['ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG','ATCGGTCGAA','ATCGGTCGAGCGTGT']
rna = [to_rna(dna) for dna in strings]

In [38]:
# The first record is the full transcript; every later record is an intron.
final = rna[0]
for intron in rna[1:]:
    # str.replace cuts out every occurrence of the intron, in the order given.
    final = final.replace(intron, "")
# What remains is the concatenated exons, ready for translation.
protein = to_protein(final)
print(protein)

MVAGRRRWSIRVSLTSYLRRTSLAQTKVPRLTVFTSRVDTHGRISTRTIYRDLFISRRRHSGTLIQSRTHSGRSSWFTGNTYACGQSTEKLRRYVPPKPVRSVGSGYSCLPLLTCELKVLYWPSRSLGQQGRPSLSFYLPLHRQLGAGGRVRWLQNSPGDMAITRCECRKPSYPRTVGLFLIARRADMHGQWVTLL
