# Sample MERS viral sequence

In [132]:
# sourced from https://www.ncbi.nlm.nih.gov/nuccore/KF745068.1
import re  # imported here so the cell does not depend on another cell having run first

# Read the raw sequence text. The handle is called `sequence_file` so that the
# builtin `input` is not shadowed for the rest of the session.
with open('MERS_sequence.txt', 'r') as sequence_file:
    mers_sequence = sequence_file.read()

print(mers_sequence)

# Keep only the lowercase letters a, t, c, g and u; every other character of
# the file text (digits, whitespace, ...) is removed.
mers_sequence_letters = re.sub('[^atcgu]', '', mers_sequence)
# NOTE(review): 21608:21614 is a hard-coded 6-letter window whose significance
# is not stated in this cell - confirm what it is meant to show.
print(mers_sequence_letters[21608:21614])

ttgtggcatt aatttgcctg ctcatctagg cagtggacat atgctcaaca ctgggtataa
       61 ttctaattga atactatttt tcagttagag cgtcgtgtct cttgtacgtc tcggtcacaa
      121 tacacggttt cgtccggtgc gtggcaattc ggggcacatc atgtctttcg tggctggtgt
      181 gaccgcgcaa ggtgcgcgcg gtacgtatcg agcagcgctc aactctgaaa aacatcaaga
      241 ccatgtgtct ctaactgtgc cactctgtgg ttcaggaaac ctggttgaaa aactttcacc
      301 atggttcatg gatggcgaaa atgcctatga agtggtgaag gccatgttac ttaaaaagga
      361 gccacttctc tatgtgccca tccggctggc tggacacact agacacctcc caggtcctcg
      421 tgtatacctg gttgagaggc tcattgcttg tgaaaatcca ttcatggtta accaattggc
      481 ttatagctct agtgcaaatg gcagcttggt tggcacaact ttgcagggca agcctattgg
      541 tatgttcttc ccttatgaca tcgaacttgt cacaggaaag caaaatattc tcctgcgcaa
      601 gtatggccgt ggtggttatc actacacccc attccactat gagcgagaca acacctcttg
      661 ccctgagtgg atggacgatt ttgaggcgga tcctaaaggc aaatatgccc agaatctgct
      721 taagaagttg attggcggtg atgtcactcc agttgaccaa tacatgtgtg gcgttgatgg
      781 aaaacccatt a

# Investigating a Sequence String

## Transcription 

In [5]:
%run ../../../codons/genes.py

cd = Codons()
%time cd.transcribe(mers_sequence)

# validate the removal of thiamines
if re.search('t|T', cd.transcribed_sequence):
    print('-->ERROR: The sequence was not transcribed.')

cd.export()

The sequence is transcribed.
Wall time: 2 ms


## Translation

### 1st Open Reading Frame

In [155]:
%run ../../../codons/genes.py
from pprint import pprint
import re

cd = Codons()
%time cd.translate(mers_sequence, organism = 'virus', open_reading_frames = False)

# determine the proportion of perfectly translated proteins
proteins = [cd.genes[gene]['protein']['sequence'] for gene in cd.genes]
proteins_str = ' '.join(proteins)
sequences, description, fasta = cd.read_fasta('MERS_proteins.fasta')
unmatched = []
unmatched_length = sequences_length = 0
for protein in sequences:
    sequences_length += len(protein)
    if not re.search(protein, proteins_str, flags = re.IGNORECASE):
        unmatched.append(protein)
        unmatched_length += len(protein)
        print(f'\n\nThe reported protein {protein} is not perfectly matched by a translated proteins.')   
print(f'\n\n{(1-(len(unmatched)/len(sequences)))*100}% of reported proteins perfectly matched predicted proteins.')

# determine the proportion of translated residues that are reported
reported_proteins_str = ' '.join(unmatched)
unmatched = []
for protein in proteins:
    sequences_length += len(protein)
    if not re.search(protein, reported_proteins_str, flags = re.IGNORECASE):
        unmatched.append(protein)
    else:
        unmatched_length -= len(protein)
print(f'\n\n{(1-(unmatched_length/sequences_length))*100}% of the reported peptide sequences are predicted.')   
print(f'\n\n{(len(unmatched)/len(proteins))*100}% false predictions of proteins')

# cd.export()

  self.amino_acid_masses = CaseInsensitiveDict(json.load(open(masses_path)))


>Protein - 4391_residues - 565,150_amu - 163-13336 bp - (+1) ORF
MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRLAGHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYTPFHYERDNTSCPEWMDDFEADPKGKYAQNLLKKLIGGDVTPVDQYMCGVDGKPISAYAFLMAKDGITKLADVEADVAARADDEGFITLKNNLYRLVWHVERKDVPYPKQSIFTINSVVQKDGVENTPPHYFTLGCKILTLTPRNKWSGVSDLSLKQKLLYTFYGKESLENPTYIYHSAFIECGSCGNDSWLTGNAIQGFACGCGASYTANDVEVQSSGMIKPNALLCATCPFAKGDSCSSNCKHSVAQLVSYLSERCNVIADSKSFTLIFGGVAYAYFGCEEGTMYFVPRAKSVVSRIGDSIFTGCTGSWNKVTQIANMFLEQTQHSLNFVGEFVVNDVVLAILSGTTTNVDKIRQLLKGVTLDKLRDYLADYDVAVTAGPFMDNAINVGGTGLQYAAITAPYVVLTGLGESFKKVATIPYKVCNSVKDTLTYYAHSVLYRVFPYDMDSGVSSFSELLFDCVDLSVASTYFLVRLLQDKTGDFMSTIITSCQTAVSKLLDTCFEATEATFNFLLDLAGLFRIFLRNAYVYTSQGFVVVNGKVSTLVKQVLDLLNKGMQLLHTKVSWAGSNISAVIYSGRESLIFPSGTYYCVTTKAKSVQQDLDVILPGEFSKKQLGLLQPTDNSTTVSVTVSSNMVETVVGQLEQTNMHSPDVIVGDYVIISEKLFVRSKEEDGFAFYPACTNGHAVPTLFRLKGGAPVKKVAFGGDQVHEVAAVRSVTVEYNIHAVLDTLLASSSLRTFVVDKSLSIEEFADVVKEQVSDLLVKLLRGMPIPDFDLD

### 3 Open Reading Frames

In [154]:
%run ../../../codons/genes.py
from pprint import pprint
import re

cd = Codons()
%time cd.translate(mers_sequence, organism = 'virus')

# determine the proportion of perfectly translated proteins
proteins = [cd.genes[gene]['protein']['sequence'] for gene in cd.genes]
proteins_str = ' '.join(proteins)
sequences, description, fasta = cd.read_fasta('MERS_proteins.fasta')
unmatched = []
unmatched_length = sequences_length = 0
for protein in sequences:
    sequences_length += len(protein)
    if not re.search(protein, proteins_str, flags = re.IGNORECASE):
        unmatched.append(protein)
        unmatched_length += len(protein)
        print(f'\n\nThe reported protein {protein} is not perfectly matched by a translated proteins.')   
print(f'\n\n{(1-(len(unmatched)/len(sequences)))*100}% of reported proteins perfectly matched predicted proteins.')

# determine the proportion of translated residues that are reported
reported_proteins_str = ' '.join(unmatched)
unmatched = []
for protein in proteins:
    sequences_length += len(protein)
    if not re.search(protein, reported_proteins_str, flags = re.IGNORECASE):
        unmatched.append(protein)
    else:
        unmatched_length -= len(protein)
print(f'\n\n{(1-(unmatched_length/sequences_length))*100}% of the reported peptide sequences are predicted.')   
print(f'\n\n{(len(unmatched)/len(proteins))*100}% false predictions of proteins')

# cd.export()

  self.amino_acid_masses = CaseInsensitiveDict(json.load(open(masses_path)))


>Protein - 4391_residues - 565,150_amu - 163-13336 bp - (+1) ORF
MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRLAGHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYTPFHYERDNTSCPEWMDDFEADPKGKYAQNLLKKLIGGDVTPVDQYMCGVDGKPISAYAFLMAKDGITKLADVEADVAARADDEGFITLKNNLYRLVWHVERKDVPYPKQSIFTINSVVQKDGVENTPPHYFTLGCKILTLTPRNKWSGVSDLSLKQKLLYTFYGKESLENPTYIYHSAFIECGSCGNDSWLTGNAIQGFACGCGASYTANDVEVQSSGMIKPNALLCATCPFAKGDSCSSNCKHSVAQLVSYLSERCNVIADSKSFTLIFGGVAYAYFGCEEGTMYFVPRAKSVVSRIGDSIFTGCTGSWNKVTQIANMFLEQTQHSLNFVGEFVVNDVVLAILSGTTTNVDKIRQLLKGVTLDKLRDYLADYDVAVTAGPFMDNAINVGGTGLQYAAITAPYVVLTGLGESFKKVATIPYKVCNSVKDTLTYYAHSVLYRVFPYDMDSGVSSFSELLFDCVDLSVASTYFLVRLLQDKTGDFMSTIITSCQTAVSKLLDTCFEATEATFNFLLDLAGLFRIFLRNAYVYTSQGFVVVNGKVSTLVKQVLDLLNKGMQLLHTKVSWAGSNISAVIYSGRESLIFPSGTYYCVTTKAKSVQQDLDVILPGEFSKKQLGLLQPTDNSTTVSVTVSSNMVETVVGQLEQTNMHSPDVIVGDYVIISEKLFVRSKEEDGFAFYPACTNGHAVPTLFRLKGGAPVKKVAFGGDQVHEVAAVRSVTVEYNIHAVLDTLLASSSLRTFVVDKSLSIEEFADVVKEQVSDLLVKLLRGMPIPDFDLD

### All possible proteins

In [153]:
%run ../../../codons/genes.py
from pprint import pprint
import re

cd = Codons()
%time cd.translate(mers_sequence, organism = 'virus', all_possible_proteins = True)

# determine the proportion of perfectly translated proteins
proteins = [cd.genes[gene]['protein']['sequence'] for gene in cd.genes]
proteins_str = ' '.join(proteins)
sequences, description, fasta = cd.read_fasta('MERS_proteins.fasta')
unmatched = []
unmatched_length = sequences_length = 0
for protein in sequences:
    sequences_length += len(protein)
    if not re.search(protein, proteins_str, flags = re.IGNORECASE):
        unmatched.append(protein)
        unmatched_length += len(protein)
        print(f'\n\nThe reported protein {protein} is not perfectly matched by a translated proteins.')   
print(f'\n\n{(1-(len(unmatched)/len(sequences)))*100}% of reported proteins perfectly matched predicted proteins.')

# determine the proportion of translated residues that are reported
reported_proteins_str = ' '.join(unmatched)
unmatched = []
for protein in proteins:
    sequences_length += len(protein)
    if not re.search(protein, reported_proteins_str, flags = re.IGNORECASE):
        unmatched.append(protein)
    else:
        unmatched_length -= len(protein)
print(f'\n\n{(1-(unmatched_length/sequences_length))*100}% of the reported peptide sequences are predicted.')   
print(f'\n\n{(len(unmatched)/len(proteins))*100}% false predictions of proteins')

# cd.export()

  self.amino_acid_masses = CaseInsensitiveDict(json.load(open(masses_path)))


>Protein - 4391_residues - 565,150_amu - 163-13336 bp - (+0) ORF
MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRLAGHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYTPFHYERDNTSCPEWMDDFEADPKGKYAQNLLKKLIGGDVTPVDQYMCGVDGKPISAYAFLMAKDGITKLADVEADVAARADDEGFITLKNNLYRLVWHVERKDVPYPKQSIFTINSVVQKDGVENTPPHYFTLGCKILTLTPRNKWSGVSDLSLKQKLLYTFYGKESLENPTYIYHSAFIECGSCGNDSWLTGNAIQGFACGCGASYTANDVEVQSSGMIKPNALLCATCPFAKGDSCSSNCKHSVAQLVSYLSERCNVIADSKSFTLIFGGVAYAYFGCEEGTMYFVPRAKSVVSRIGDSIFTGCTGSWNKVTQIANMFLEQTQHSLNFVGEFVVNDVVLAILSGTTTNVDKIRQLLKGVTLDKLRDYLADYDVAVTAGPFMDNAINVGGTGLQYAAITAPYVVLTGLGESFKKVATIPYKVCNSVKDTLTYYAHSVLYRVFPYDMDSGVSSFSELLFDCVDLSVASTYFLVRLLQDKTGDFMSTIITSCQTAVSKLLDTCFEATEATFNFLLDLAGLFRIFLRNAYVYTSQGFVVVNGKVSTLVKQVLDLLNKGMQLLHTKVSWAGSNISAVIYSGRESLIFPSGTYYCVTTKAKSVQQDLDVILPGEFSKKQLGLLQPTDNSTTVSVTVSSNMVETVVGQLEQTNMHSPDVIVGDYVIISEKLFVRSKEEDGFAFYPACTNGHAVPTLFRLKGGAPVKKVAFGGDQVHEVAAVRSVTVEYNIHAVLDTLLASSSLRTFVVDKSLSIEEFADVVKEQVSDLLVKLLRGMPIPDFDLD



100.0% of the reported peptide sequences are predicted.


100.0% false predictions of proteins


### Complementary strand

In [152]:
%run ../../../codons/genes.py
from pprint import pprint
import re

cd = Codons()
%time cd.translate(mers_sequence, organism = 'virus', sense_strand_translation = True)

# determine the proportion of perfectly translated proteins
proteins = [cd.genes[gene]['protein']['sequence'] for gene in cd.genes]
proteins_str = ' '.join(proteins)
sequences, description, fasta = cd.read_fasta('MERS_proteins.fasta')
unmatched = []
unmatched_length = sequences_length = 0
for protein in sequences:
    sequences_length += len(protein)
    if not re.search(protein, proteins_str, flags = re.IGNORECASE):
        unmatched.append(protein)
        unmatched_length += len(protein)
        print(f'\n\nThe reported protein {protein} is not perfectly matched by a translated proteins.')   
print(f'\n\n{(1-(len(unmatched)/len(sequences)))*100}% of reported proteins perfectly matched predicted proteins.')

# determine the proportion of translated residues that are reported
reported_proteins_str = ' '.join(unmatched)
unmatched = []
for protein in proteins:
    sequences_length += len(protein)
    if not re.search(protein, reported_proteins_str, flags = re.IGNORECASE):
        unmatched.append(protein)
    else:
        unmatched_length -= len(protein)
print(f'\n\n{(1-(unmatched_length/sequences_length))*100}% of the reported peptide sequences are predicted.')   
print(f'\n\n{(len(unmatched)/len(proteins))*100}% false predictions of proteins')

# cd.export()

  self.amino_acid_masses = CaseInsensitiveDict(json.load(open(masses_path)))


>Protein - 4391_residues - 565,150_amu - 163-13336 bp - (+1) ORF
MSFVAGVTAQGARGTYRAALNSEKHQDHVSLTVPLCGSGNLVEKLSPWFMDGENAYEVVKAMLLKKEPLLYVPIRLAGHTRHLPGPRVYLVERLIACENPFMVNQLAYSSSANGSLVGTTLQGKPIGMFFPYDIELVTGKQNILLRKYGRGGYHYTPFHYERDNTSCPEWMDDFEADPKGKYAQNLLKKLIGGDVTPVDQYMCGVDGKPISAYAFLMAKDGITKLADVEADVAARADDEGFITLKNNLYRLVWHVERKDVPYPKQSIFTINSVVQKDGVENTPPHYFTLGCKILTLTPRNKWSGVSDLSLKQKLLYTFYGKESLENPTYIYHSAFIECGSCGNDSWLTGNAIQGFACGCGASYTANDVEVQSSGMIKPNALLCATCPFAKGDSCSSNCKHSVAQLVSYLSERCNVIADSKSFTLIFGGVAYAYFGCEEGTMYFVPRAKSVVSRIGDSIFTGCTGSWNKVTQIANMFLEQTQHSLNFVGEFVVNDVVLAILSGTTTNVDKIRQLLKGVTLDKLRDYLADYDVAVTAGPFMDNAINVGGTGLQYAAITAPYVVLTGLGESFKKVATIPYKVCNSVKDTLTYYAHSVLYRVFPYDMDSGVSSFSELLFDCVDLSVASTYFLVRLLQDKTGDFMSTIITSCQTAVSKLLDTCFEATEATFNFLLDLAGLFRIFLRNAYVYTSQGFVVVNGKVSTLVKQVLDLLNKGMQLLHTKVSWAGSNISAVIYSGRESLIFPSGTYYCVTTKAKSVQQDLDVILPGEFSKKQLGLLQPTDNSTTVSVTVSSNMVETVVGQLEQTNMHSPDVIVGDYVIISEKLFVRSKEEDGFAFYPACTNGHAVPTLFRLKGGAPVKKVAFGGDQVHEVAAVRSVTVEYNIHAVLDTLLASSSLRTFVVDKSLSIEEFADVVKEQVSDLLVKLLRGMPIPDFDLD

# Brainstorming

In [66]:
# test ORFIpy
from pprint import pprint
import orfipy_core

# Collect the description fields of every ORF that orfipy yields and remember,
# keyed by ORF length, the ids of up to 60 of the longest ones.
max_sequences = {}
descriptions = {}
for start, stop, strand, description in orfipy_core.orfs(mers_sequence.upper(), starts=['ATG']):
    # The description is split on ';' into 'name=value' or 'name:value' fields.
    for info in description.split(';'):
        if '=' in info:
            info = info.split('=')
            name, content = info[0], info[1]
            if name == 'ID':
                # The integer after the first '.' of the ID keys this ORF's record.
                id_content = int(content.split('.')[1])
                descriptions[id_content] = {}
                continue
        else:
            info = info.split(':')
            name, content = info[0], info[1]
        # NOTE(review): relies on the ID field coming before every other field
        # of the same description - confirm against orfipy's output format.
        descriptions[id_content][name] = content
    sequence_len = int(descriptions[id_content]['ORF_len'])
    # NOTE(review): the dict is keyed by length, so two ORFs of equal length
    # share one slot and the later one replaces the earlier - confirm intended.
    if len(max_sequences) == 60:
        min_sequence = min(max_sequences)
        if sequence_len > min_sequence:
            max_sequences.pop(min_sequence)
            max_sequences[sequence_len] = id_content
    else:
        max_sequences[sequence_len] = id_content
    descriptions[id_content]['start'] = start
    descriptions[id_content]['stop'] = stop

# assess the uniqueness of the predicted sequences
max_sequences = dict(sorted(max_sequences.items()))
for seq, index in max_sequences.items():
    print(descriptions[index])


def _non_overlapping_ranges(strand_sign):
    """Return the ranges of the retained ORFs on one strand that do not overlap.

    strand_sign is +1 to select ORFs whose 'ORF_frame' is positive and -1 to
    select those whose frame is negative. ORFs are visited in the (ascending
    length) order of max_sequences; each one that overlaps an already kept
    range is printed and skipped, the others are kept as range(start, stop).
    """
    kept = []
    for seq, index in max_sequences.items():
        if int(descriptions[index]['ORF_frame']) * strand_sign > 0:
            start = descriptions[index]['start']
            stop = descriptions[index]['stop']
            # Interval test instead of endpoint membership: it flags the same
            # cases as `start in rng or stop in rng`, plus the case where this
            # ORF completely encloses a kept range (neither endpoint inside).
            if any(start < rng.stop and rng.start <= stop for rng in kept):
                print(f'{index} ({start}-{stop}) overlaps with another frame')
            else:
                kept.append(range(start, stop))
    return kept


print('\npos_ranges')
pos_ranges = _non_overlapping_ranges(+1)
print(pos_ranges)

print('\nneg_ranges')
neg_ranges = _non_overlapping_ranges(-1)
print(neg_ranges)
    
# print('1')
# for seq, index in max_sequences.items():
#     if int(descriptions[index]['ORF_frame']) == 1:
#         print(descriptions[index]['start'], descriptions[index]['stop'])

# print('2')        
# for seq, index in max_sequences.items():
#     if int(descriptions[index]['ORF_frame']) == 2:
#         print(descriptions[index]['start'], descriptions[index]['stop'])
        
# print('3')
# for seq, index in max_sequences.items():
#     if int(descriptions[index]['ORF_frame']) == 3:
#         print(descriptions[index]['start'], descriptions[index]['stop'])
        
# print('-1')
# for seq, index in max_sequences.items():
#     if int(descriptions[index]['ORF_frame']) == -1:
#         print(descriptions[index]['start'], descriptions[index]['stop'])
        
# print('-2')
# for seq, index in max_sequences.items():
#     if int(descriptions[index]['ORF_frame']) == -2:
#         print(descriptions[index]['start'], descriptions[index]['stop'])
        
# print('-3')
# for seq, index in max_sequences.items():
#     if int(descriptions[index]['ORF_frame']) == -3:
#         print(descriptions[index]['start'], descriptions[index]['stop'])

{'ORF_type': 'complete', 'ORF_len': '123', 'ORF_frame': '-3', 'Start': 'ATG', 'Stop': 'TGA', 'start': 10353, 'stop': 10476}
{'ORF_type': 'complete', 'ORF_len': '126', 'ORF_frame': '-3', 'Start': 'ATG', 'Stop': 'TGA', 'start': 8805, 'stop': 8931}
{'ORF_type': 'complete', 'ORF_len': '129', 'ORF_frame': '-3', 'Start': 'ATG', 'Stop': 'TAA', 'start': 12438, 'stop': 12567}
{'ORF_type': 'complete', 'ORF_len': '132', 'ORF_frame': '-3', 'Start': 'ATG', 'Stop': 'TAA', 'start': 15717, 'stop': 15849}
{'ORF_type': 'complete', 'ORF_len': '135', 'ORF_frame': '-3', 'Start': 'ATG', 'Stop': 'TGA', 'start': 32196, 'stop': 32331}
{'ORF_type': 'complete', 'ORF_len': '138', 'ORF_frame': '-2', 'Start': 'ATG', 'Stop': 'TAG', 'start': 20995, 'stop': 21133}
{'ORF_type': 'complete', 'ORF_len': '141', 'ORF_frame': '3', 'Start': 'ATG', 'Stop': 'TAG', 'start': 34289, 'stop': 34430}
{'ORF_type': 'complete', 'ORF_len': '144', 'ORF_frame': '-3', 'Start': 'ATG', 'Stop': 'TGA', 'start': 23295, 'stop': 23439}
{'ORF_type'

In [68]:
# Run genes.py (presumably where Codons is defined - confirm) and start from a
# fresh Codons instance for this experiment.
%run ../../../codons/genes.py
cd = Codons()
# The literal passed to translate() below is pasted with its position labels
# (21301 ... 25441) and space-separated 10-letter groups still in place.
# NOTE(review): presumably translate() strips digits and whitespace itself - confirm.
cd.translate('''
    21301 tctctcctgt cgcagggtaa gttacttatc cgtgacaatg atacactcag tgtttctact
    21361 gatgttcttg ttaacaccta cagaaagtta cgttgatgta gggccagatt ctgttaagtc
    21421 tgcttgtatt gaggttgata tacaacagac tttctttgat aaaacttggc ctaggccaat
    21481 tgatgtttct aaggctgacg gtattatata ccctcaaggc cgtacatatt ctaacataac
    21541 tatcacttat caaggtcttt ttccctatca gggagaccat ggtgatatgt atgtttactc
    21601 tgcaggacat gctacaggca caactccaca aaagttgttt gtagctaact attctcagga
    21661 cgtcaaacag tttgctaatg ggtttgtcgt ccgtatagga gcagctgcct attccactgg
    21721 cactgttatt attagcccat ctaccagcgc tactatacga aaaatttacc ctgcttttat
    21781 gctgggttct tcagttggta atttctcaga tggtaaaatg ggccgcttct tcaatcatac
    21841 tctagttctt ttgcccgatg gatgtggcac tttacttaga gctttttatt gtattctaga
    21901 gcctcgctct ggaaatcatt gtcctgctgg caattcctat acttcttttg ccacttatca
    21961 cactcctgca acagattgtt ctgatggcaa ttacaatcgt aatgccagtc tgaactcttt
    22021 taaggagtat tttaatttac gtaactgcac ctttatgtac acttataaca ttaccgaaga
    22081 tgagatttta gagtggtttg gcattacaca aactgctcaa ggtgttcacc tcttctcatc
    22141 tcggtatgtt gatttgtacg gcggcaatat gtttcaattt gccaccttgc ctgtttatga
    22201 cactattaag tattactcta tcattcctca cagtattcgt tctatccaaa gtgatagaaa
    22261 agcttgggct gccttctacg tatataaact tcaaccgtta actttcctgt tggatttttc
    22321 tgttgatggt tatatacgca gagctataga ctgtggtttt aatgatttgt cacaactcca
    22381 ctgctcatat gaatccttcg atgttgaatc tggagtttat tcagtttcgt ctttcgaagc
    22441 aaaaccttct ggctcagttg tggaacaggc tgaaggtgtt gaatgtgatt tttcacctct
    22501 tctgtctggc acacctcctc aggtttataa tttcaagcgt ttggttttta ccaattgcaa
    22561 ttataatctt accaaattgc tttcactttt ttctgtgaat gattttactt gtagtcaaat
    22621 atctccagca gcaattgcta gcaactgtta ttcttcactg attttggatt acttttcata
    22681 cccacttagt atgaaatccg atctcagtgt tagttctgct ggtccaatat cccagtttaa
    22741 ttataaacag tccttttcta atcccacatg tttgatttta gcgactgttc ctcataacct
    22801 tactactatt actaagcctc ttaagtacag ctatattaac aagtgctctc gtcttctttc
    22861 tgatgatcgt actgaagtac ctcagttagt gaacgctaat caatactcac cctgtgtatc
    22921 cattgtccca tccactgtgt gggaagacgg tgattattat aggaaacaac tatctccact
    22981 tgaaggtggt ggctggcttg ttgctagtgg ctcaactgtt gccatgactg agcaattaca
    23041 gatgggcttt ggtattacag ttcaatatgg tacagacacc aatagtgttt gccccaagct
    23101 tgaatttgct aatgacacaa aaattgcctc tcaattaggc aattgcgtgg aatattccct
    23161 ctatggtgtt tcgggccgtg gtgtttttca gaattgcaca gctgtaggtg ttcgacagca
    23221 gcgctttgtt tatgatgcgt accagaattt agttggctat tattctgatg atggcaacta
    23281 ctactgtttg cgtgcttgtg ttagtgttcc tgtttctgtc atctatgata aagaaactaa
    23341 aacccacgct actctatttg gtagtgttgc atgtgaacac atttcttcta ccatgtctca
    23401 atactcccgt tctacgcgat caatgcttaa acggcgagat tctacatatg gcccccttca
    23461 gacacctgtt ggttgtgtcc taggacttgt taattcctct ttgttcgtag aggactgcaa
    23521 gttgcctctc ggtcaatctc tctgtgctct tcctgacaca cctagtactc tcacacctcg
    23581 cagtgtgcgc tctgttccag gtgaaatgcg cttggcatcc attgctttta atcatcccat
    23641 tcaggttgat caacttaata gtagttattt taaattaagt atacccacta atttttcctt
    23701 tggtgtgact caggagtaca ttcagacaac cattcagaaa gttactgttg attgtaaaca
    23761 gtacgtttgc aatggtttcc agaagtgtga gcaattactg cgcgagtatg gccagttttg
    23821 ttccaaaata aaccaggctc tccatggtgc caatttacgc caggatgatt ctgtacgtaa
    23881 tttgtttgcg agcgtgaaaa gctctcaatc atctcctatc ataccaggtt ttggaggtga
    23941 ctttaatttg acacttctag aacctgtttc tatatctact ggcagtcgta gtgcacgtag
    24001 tgctattgag gatttgctat ttgacaaagt cactatagct gatcctggtt atatgcaagg
    24061 ttacgatgat tgtatgcatc aaggtccagc atcagctcgt gatcttattt gtgctcaata
    24121 tgtggctggt tataaagtat tacctcctct tatggatgtt aatatggaag ccgcgtatac
    24181 ttcatctttg cttggcagca tagcaggtgt tggctggact gctggcttat cctcctttgc
    24241 tgctattcca tttgcacaga gtatctttta taggttaaac ggtgttggca ttactcaaca
    24301 ggttctttca gagaaccaaa agcttattgc caataagttt aatcaggctc tgggagctat
    24361 gcaaacaggc ttcactacaa ctaatgaagc ttttcggaag gttcaggatg ctgtgaacaa
    24421 caatgcacag gctctatcca aattagctag cgagctatct aatacttttg gtgctatttc
    24481 cgcctctatt ggagacatca tacaacgtct tgatgttctc gaacaggacg cccaaataga
    24541 cagacttatt aatggccgtt tgacaacact aaatgctttt gttgcacagc agcttgttcg
    24601 ttccgaatca gctgctcttt ccgctcaatt ggctaaagat aaagtcaatg agtgtgtcaa
    24661 ggcacaatcc aagcgttctg gattttgcgg tcaaggcaca catatagtgt cctttgttgt
    24721 aaatgcccct aatggccttt acttcatgca tgttggttat taccctagca accacattga
    24781 ggttgtttct gcttatggtc tttgcgatgc agctaaccct actaattgta tagcccctgt
    24841 taatggctac tttattaaaa ctaataacac taggattgtt gatgagtggt catatactgg
    24901 ctcgtccttc tattcacctg agcccatcac ctcccttaat actaagtatg ttgcaccaca
    24961 ggtgacatac caaaacattt ctactaacct ccctcctcct cttctcggca attccaccgg
    25021 gattgacttc caagatgagt tggatgagtt tttcaaaaat gttagcacca gtatacctaa
    25081 ttttggttct ctaacacaga ttaatactac attactcgat cttacctacg agatgttgtc
    25141 tcttcaacaa gttgttaaag cccttaatga gtcttacata gaccttaaag agcttggcaa
    25201 ttatacttat tacaacaaat ggccgtggta catttggctt ggtttcattg ctgggcttgt
    25261 tgccttagct ctatgcgtct tcttcatact gtgctgcact ggttgtggca caaactgtat
    25321 gggaaaactt aagtgtaatc gttgttgtga tagatacgag gaatacgacc tcgagccgca
    25381 taaggttcat gttcactaat taacgaacta tcaatgagag ttcaaagacc acccactctc
    25441 ttgttagtgt tctcactctc tcttttggtc actgcattct caaaacctct ctatgtacct''')

atgatacactcagtgtttctactgatgttcttgttaacacctacagaaagttacgttgatgtagggccagattctgttaagtctgcttgtattgaggttgatatacaacagactttctttgataaaacttggcctaggccaattgatgtttctaaggctgacggtattatataccctcaaggccgtacatattctaacataactatcacttatcaaggtctttttccctatcagggagaccatggtgatatgtatgtttactctgcaggacatgctacaggcacaactccacaaaagttgtttgtagctaactattctcaggacgtcaaacagtttgctaatgggtttgtcgtccgtataggagcagctgcctattccactggcactgttattattagcccatctaccagcgctactatacgaaaaatttaccctgcttttatgctgggttcttcagttggtaatttctcagatggtaaaatgggccgcttcttcaatcatactctagttcttttgcccgatggatgtggcactttacttagagctttttattgtattctagagcctcgctctggaaatcattgtcctgctggcaattcctatacttcttttgccacttatcacactcctgcaacagattgttctgatggcaattacaatcgtaatgccagtctgaactcttttaaggagtattttaatttacgtaactgcacctttatgtacacttataacattaccgaagatgagattttagagtggtttggcattacacaaactgctcaaggtgttcacctcttctcatctcggtatgttgatttgtacggcggcaatatgtttcaatttgccaccttgcctgtttatgacactattaagtattactctatcattcctcacagtattcgttctatccaaagtgatagaaaagcttgggctgccttctacgtatataaacttcaaccgttaactttcctgttggatttttctgttgatggttatatacgcagagctatagactgtggttttaatgatttgtcacaactccactgctcatatgaatccttcgatgttgaatctggagtttattcagtttcgtctttcgaagcaaaaccttctggctcagttgtggaacaggctgaaggtgttgaatgtgatttttcacctcttctgtctggcacacctcctcaggtttataatttcaagcgtttggtttttaccaattgcaattataatcttaccaaattgctttcacttttttctgtgaatgattttacttgtagtcaaatatctccagcagcaattgctagcaactgttattcttcactgattttggattacttttcatacccacttagtatgaaatccgatctcagtgttagttctgctggtccaatatcccagtttaattataaacagtccttttctaatcccacatgtttgattttagcgactgttcctcataaccttactactattactaagcctcttaagtacagctatattaacaagtgctctcgtcttctttctgatgatcgtactgaagtacctcagttagtgaacgctaatcaatactcaccctgtgtatccattgtcccatccactgtgtgggaagacggtgattattataggaaacaactatctccacttgaaggtggtggctggcttgttgctagtggctcaactgttgccatgactgagcaattacagatgggctttggtattacagttcaatatggtacagacaccaatagtgtttgccccaagcttgaatttgctaatgacacaaaaattgcctctcaattaggcaattgcgtggaatattccctctatggtgtttcgggccgtggtgtttttcagaattgcacagctgtaggtgttcgacagcagcgctttgtttatgatgcgtaccagaatttagttggctattattctgatgatggcaactactactgtttgcgtgcttgtgttagtgttcctgtttctgtcatctatgataaagaaac
taaaacccacgctactctatttggtagtgttgcatgtgaacacatttcttctaccatgtctcaatactcccgttctacgcgatcaatgcttaaacggcgagattctacatatggcccccttcagacacctgttggttgtgtcctaggacttgttaattcctctttgttcgtagaggactgcaagttgcctctcggtcaatctctctgtgctcttcctgacacacctagtactctcacacctcgcagtgtgcgctctgttccaggtgaaatgcgcttggcatccattgcttttaatcatcccattcaggttgatcaacttaatagtagttattttaaattaagtatacccactaatttttcctttggtgtgactcaggagtacattcagacaaccattcagaaagttactgttgattgtaaacagtacgtttgcaatggtttccagaagtgtgagcaattactgcgcgagtatggccagttttgttccaaaataaaccaggctctccatggtgccaatttacgccaggatgattctgtacgtaatttgtttgcgagcgtgaaaagctctcaatcatctcctatcataccaggttttggaggtgactttaatttgacacttctagaacctgtttctatatctactggcagtcgtagtgcacgtagtgctattgaggatttgctatttgacaaagtcactatagctgatcctggttatatgcaaggttacgatgattgtatgcatcaaggtccagcatcagctcgtgatcttatttgtgctcaatatgtggctggttataaagtattacctcctcttatggatgttaatatggaagccgcgtatacttcatctttgcttggcagcatagcaggtgttggctggactgctggcttatcctcctttgctgctattccatttgcacagagtatcttttataggttaaacggtgttggcattactcaacaggttctttcagagaaccaaaagcttattgccaataagtttaatcaggctctgggagctatgcaaacaggcttcactacaactaatgaagcttttcggaaggttcaggatgctgtgaacaacaatgcacaggctctatccaaattagctagcgagctatctaatacttttggtgctatttccgcctctattggagacatcatacaacgtcttgatgttctcgaacaggacgcccaaatagacagacttattaatggccgtttgacaacactaaatgcttttgttgcacagcagcttgttcgttccgaatcagctgctctttccgctcaattggctaaagataaagtcaatgagtgtgtcaaggcacaatccaagcgttctggattttgcggtcaaggcacacatatagtgtcctttgttgtaaatgcccctaatggcctttacttcatgcatgttggttattaccctagcaaccacattgaggttgtttctgcttatggtctttgcgatgcagctaaccctactaattgtatagcccctgttaatggctactttattaaaactaataacactaggattgttgatgagtggtcatatactggctcgtccttctattcacctgagcccatcacctcccttaatactaagtatgttgcaccacaggtgacataccaaaacatttctactaacctccctcctcctcttctcggcaattccaccgggattgacttccaagatgagttggatgagtttttcaaaaatgttagcaccagtatacctaattttggttctctaacacagattaatactacattactcgatcttacctacgagatgttgtctcttcaacaagttgttaaagcccttaatgagtcttacatagaccttaaagagcttggcaattatacttattacaacaaatggccgtggtacatttggcttggtttcattgctgggcttgttgccttagctctatgcgtcttcttcatactgtgctgcactggttgtggcacaaactgtatgggaaaacttaagtgta
atcgttgttgtgatagatacgaggaatacgacctcgagccgcataaggttcatgttcactaa

>Protein - 1353_residues - 173,830_amu
MIHSVFLLMFLLTPTESYVDVGPDSVKSACIEVDIQQTFFDKTWPRPIDVSKADGIIYPQGRTYSNITITYQGLFPYQGDHGDMYVYSAGHATGTTPQKLFVANYSQDVKQFANGFVVRIGAAAYSTGTVIISPSTSATIRKIYPAFMLGSSVGNFSDGKMGRFFNHTLVLLPDGCGTLLRAFYCILEPRSGNHCPAGNSYTSFATYHTPATDCSDGNYNRNASLNSFKEYFNLRNCTFMYTYNITEDEILEWFGITQTAQGVHLFSSRYVDLYGGNMFQFATLPVYDTIKYYSIIPHSIRSIQSDRKAWAAFYVYKLQPLTFLLDFSVDGYIRRAIDCGFNDLSQLHCSYESFDVESGVYSVSSFEAKPSGSVVEQAEGVECDFSPLLSGTPPQVYNFKRLVFTNCNYNLTKLLSLFSVNDFTCSQISPAAIASNCYSSLILDYFSYPLSMKSDLSVSSAGPISQFNYKQSFSNPTCLILATVPHNLTTITKPLKYSYINKCSRLLSDDRTEVPQLVNANQYSPCVSIVPSTVWEDGDYYRKQLSPLEGGGWLVASGSTVAMTEQLQMGFGITVQYGTDTNSVCPKLEFANDTKIASQLGNCVEYSLYGVSGRGVFQNCTAVGVRQQRFVYDAYQNLVGYYSDDGNYYCLRACVSVPVSVIYDKETKTHATLFGSVACEHISSTMSQYSRSTRSMLKRRDSTYGPLQTPVGCVLGLVNSSLFVEDCKLPLGQSLCALPDTPSTLTPRSVRSVPGEMRLASIAFNHPIQVDQLNSSYFKLSIPTNFSFGVTQEYIQTTIQKVTVDCKQYVCNGFQKCEQLLREYGQFCSKINQALHGANLRQDDSVRNLFASVKSSQSSPIIPGFGGDFNLTLLEPVSISTGSRSARSAIEDLLFDKVTIADPGYMQGYDDCMHQGPASARDLICAQYVAGYKVLPPLMDVNMEAAYTSSLLGSIAGVGWT

In [88]:
from pprint import pprint
import difflib
import re

# Dump the full gene dictionary currently held by `cd`.
# NOTE(review): `cd` comes from an earlier cell, so the content depends on which
# translate() cell ran last - confirm it is the excerpt translation above.
pprint(cd.genes)

# `reported` and `predicted` are hard-coded amino-acid strings that are compared
# character by character at the end of this cell.
# NOTE(review): presumably the published protein vs. the Codons prediction - confirm provenance.
reported = """MIHSVFLLMFLLTPTESYVDVGPDSVKSACIEVDIQQTFFDKTWPRPIDVSKADGIIYPQGRTYSNITITYQGLFPYQGDHGDMYVYSAGHATGTTPQKLFVANYSQDVKQFANGFVVRIGAAAYSTGTVIISPSTSATIRKIYPAFMLGSSVGNFSDGKMGRFFNHTLVLLPDGCGTLLRAFYCILEPRSGNHCPAGNSYTSFATYHTPATDCSDGNYNRNASLNSFKEYFNLRNCTFMYTYNITEDEILEWFGITQTAQGVHLFSSRYVDLYGGNMFQFATLPVYDTIKYYSIIPHSIRSIQSDRKAWAAFYVYKLQPLTFLLDFSVDGYIRRAIDCGFNDLSQLHCSYESFDVESGVYSVSSFEAKPSGSVVEQAEGVECDFSPLLSGTPPQVYNFKRLVFTNCNYNLTKLLSLFSVNDFTCSQISPAAIASNCYSSLILDYFSYPLSMKSDLSVSSAGPISQFNYKQSFSNPTCLILATVPHNLTTITKPLKYSYINKCSRLLSDDRTEVPQLVNANQYSPCVSIVPSTVWEDGDYYRKQLSPLEGGGWLVASGSTVAMTEQLQMGFGITVQYGTDTNSVCPKLEFANDTKIASQLGNCVEYSLYGVSGRGVFQNCTAVGVRQQRFVYDAYQNLVGYYSDDGNYYCLRACVSVPVSVIYDKETKTHATLFGSVACEHISSTMSQYSRSTRSMLKRRDSTYGPLQTPVGCVLGLVNSSLFVEDCKLPLGQSLCALPDTPSTLTPRSVRSVPGEMRLASIAFNHPIQVDQLNSSYFKLSIPTNFSFGVTQEYIQTTIQKVTVDCKQYVCNGFQKCEQLLREYGQFCSKINQALHGANLRQDDSVRNLFASVKSSQSSPIIPGFGGDFNLTLLEPVSISTGSRSARSAIEDLLFDKVTIADPGYMQGYDDCMHQGPASARDLICAQYVAGYKVLPPLMDVNMEAAYTSSLLGSIAGVGWTAGLSSFAAIPFAQSIFYRLNGVGITQQVLSENQKLIANKFNQALGAMQTGFTTTNEAFRKVQDAVNNNAQALSKLASELSNTFGAISASIGDIIQRLDVLEQDAQIDRLINGRLTTLNAFVAQQLVRSESAALSAQLAKDKVNECVKAQSKRSGFCGQGTHIVSFVVNAPNGLYFMHVGYYPSNHIEVVSAYGLCDAANPTNCIAPVNGYFIKTNNTRIVDEWSYTGSSFYSPEPITSLNTKYVAPQVTYQNISTNLPPPLLGNSTGIDFQDELDEFFKNVSTSIPNFGSLTQINTTLLDLTYEMLSLQQVVKALNESYIDLKELGNYTYYNKWPWYIWLGFIAGLVALALCVFFILCCTGCGTNCMGKLKCNRCCDRYEEYDLEPHKVHVH"""

predicted = '''MIHSVFLLMFLLTPTESYVDVGPDSVKSACIEVDIQQTFFDKTWPRPIDVSKADGIIYPQGRTYSNITITYQGLFPYQGDHGDMYVYSAGHATGTTPQKLFVANYSQDVKQFANGFVVRIGAAAYSTGTVIISPSTSATIRKIYPAFMLGSSVGNFSDGKMGRFFNHTLVLLPDGCGTLLRAFYCILEPRSGNHCPAGNSYTSFATYHTPATDCSDGNYNRNASLNSFKEYFNLRNCTFMYTYNITEDEILEWFGITQTAQGVHLFSSRYVDLYGGNMFQFATLPVYDTIKYYSIIPHSIRSIQSDRKAWAAFYVYKLQPLTFLLDFSVDGYIRRAIDCGFNDLSQLHCSYESFDVESGVYSVSSFEAKPSGSVVEQAEGVECDFSPLLSGTPPQVYNFKRLVFTNCNYNLTKLLSLFSVNDFTCSQISPAAIASNCYSSLILDYFSYPLSMKSDLSVSSAGPISQFNYKQSFSNPTCLILATVPHNLTTITKPLKYSYINKCSRLLSDDRTEVPQLVNANQYSPCVSIVPSTVWEDGDYYRKQLSPLEGGGWLVASGSTVAMTEQLQMGFGITVQYGTDTNSVCPKLEFANDTKIASQLGNCVEYSLYGVSGRGVFQNCTAVGVRQQRFVYDAYQNLVGYYSDDGNYYCLRACVSVPVSVIYDKETKTHATLFGSVACEHISSTMSQYSRSTRSMLKRRDSTYGPLQTPVGCVLGLVNSSLFVEDCKLPLGQSLCALPDTPSTLTPRSVRSVPGEMRLASIAFNHPIQVDQLNSSYFKLSIPTNFSFGVTQEYIQTTIQKVTVDCKQYVCNGFQKCEQLLREYGQFCSKINQALHGANLRQDDSVRNLFASVKSSQSSPIIPGFGGDFNLTLLEPVSISTGSRSARSAIEDLLFDKVTIADPGYMQGYDDCMHQGPASARDLICAQYVAGYKVLPPLMDVNMEAAYTSSLLGSIAGVGWTAGLSSFAAIPFAQSIFYRLNGVGITQQVLSENQKLIANKFNQALGAMQTGFTTTNEAFRKVQDAVNNNAQALSKLASELSNTFGAISASIGDIIQRLDVLEQDAQIDRLINGRLTTLNAFVAQQLVRSESAALSAQLAKDKVNECVKAQSKRSGFCGQGTHIVSFVVNAPNGLYFMHVGYYPSNHIEVVSAYGLCDAANPTNCIAPVNGYFIKTNNTRIVDEWSYTGSSFYSPEPITSLNTKYVAPQVTYQNISTNLPPPLLGNSTGIDFQDELDEFFKNVSTSIPNFGSLTQINTTLLDLTYEMLSLQQVVKALNESYIDLKELGNYTYYNKWPWYIWLGFIAGLVALALCVFFILCCTGCGTNCMGKLKCNRCCDRYEEYDLEPHKVHVH'''

# Report whether the two sequences are identical and, if they are not, list
# every character-level deletion/addition found by difflib.ndiff.
is_match = reported == predicted
print('match:', is_match)
if not is_match:
    # Message template per ndiff marker; entries with any other marker
    # (unchanged ' ' entries, '?' hints) print nothing.
    templates = {
        '-': u'Delete "{}" from position {}',
        '+': u'Add "{}" to position {}',
    }
    # `position` is the index of the entry in the diff stream, not an index
    # into either of the two strings.
    for position, entry in enumerate(difflib.ndiff(reported, predicted)):
        template = templates.get(entry[0])
        if template is not None:
            print(template.format(entry[-1], position))

{'atgagagttcaaagaccacccactctcttgttagtgttctcactctctcttttggtcactgcattctcaaaacctctctatgtac': {'codons': ['atg',
                                                                                                      'aga',
                                                                                                      'gtt',
                                                                                                      'caa',
                                                                                                      'aga',
                                                                                                      'cca',
                                                                                                      'ccc',
                                                                                                      'act',
                                                                                                      'ctc',
                   