In [149]:
# Example sequences fed to analyze_DNA, one scenario per entry.
inputDNA = [
    # ATG GAA CAA GAA TGA: one start..stop frame holding three residues.
    'ATGGAACAAGAATGA',
    # Three back-to-back frames of one residue each.
    'ATGATTTAAATGATCTAAATGATTTAA',
    # Three frames with extra codons before, between and after them.
    'CATATGATTATTTAAATCATGATTATTTAGGATATGGATATTTAGATT',
    # A second ATG appears before the stop codon.
    'ATGATTATGTAA',
    # ATG is immediately followed by a stop codon.
    'ATTATGTAA',
    # A start codon that is never followed by a stop codon.
    'ATGCGTCGT',
]

In [150]:
def codonize(DNA):
    """Split ``DNA`` into consecutive chunks of three items.

    Chunks are taken left to right starting at index 0. When ``len(DNA)`` is
    not a multiple of three, the final chunk is shorter than three items.

    Returns:
        list: the chunks, in order; empty when ``DNA`` is empty.
    """
    chunks = []
    for offset in range(0, len(DNA), 3):
        chunks.append(DNA[offset:offset + 3])
    return chunks

In [151]:
def translate(codons):
    """Map each three-letter DNA codon to its amino-acid symbol.

    Translation proceeds in order and stops silently at the first item whose
    length is not three (e.g. a trailing partial codon). Stop codons are
    reported as the string ``'STOP'``.

    Args:
        codons: iterable of codon strings made of the letters A, C, G, T.

    Returns:
        list: one symbol per translated codon.

    Raises:
        Exception: if a three-item codon is not present in the table.
    """
    # Codon table grouped by product; each value lists the codons for that product.
    codons_by_product = {
        'A': 'GCA GCC GCG GCT',
        'C': 'TGC TGT',
        'D': 'GAC GAT',
        'E': 'GAA GAG',
        'F': 'TTC TTT',
        'G': 'GGA GGC GGG GGT',
        'H': 'CAC CAT',
        'I': 'ATA ATC ATT',
        'K': 'AAA AAG',
        'L': 'CTA CTC CTG CTT TTA TTG',
        'M': 'ATG',
        'N': 'AAC AAT',
        'P': 'CCA CCC CCG CCT',
        'Q': 'CAA CAG',
        'R': 'AGA AGG CGA CGC CGG CGT',
        'S': 'AGC AGT TCA TCC TCG TCT',
        'T': 'ACA ACC ACG ACT',
        'V': 'GTA GTC GTG GTT',
        'W': 'TGG',
        'Y': 'TAC TAT',
        'STOP': 'TAA TAG TGA',
    }
    # Invert the grouping into a direct codon -> product lookup.
    lookup = {}
    for product, group in codons_by_product.items():
        for triplet in group.split():
            lookup[triplet] = product

    residues = []
    for triplet in codons:
        if len(triplet) != 3:
            # Incomplete codon: nothing after it is translated.
            break
        product = lookup.get(triplet)
        if product is None:
            raise Exception(f"codon {triplet} has no associated amino acid")
        residues.append(product)
    return residues

In [152]:
def process_AA(in_AA):
    """Extract proteins from a list of translated amino-acid symbols.

    A protein is the run of symbols strictly between an ``'M'`` and the first
    ``'STOP'`` that follows it. The opening ``'M'`` is excluded; any later
    ``'M'`` before the stop is part of the protein. Frames are consumed left
    to right, and ``in_AA`` itself is never modified.

    A failure is counted when:
      * symbols remain but none of them is ``'M'``;
      * an ``'M'`` is found with no ``'STOP'`` after it;
      * ``'M'`` is immediately followed by ``'STOP'`` (empty protein).
    The first two also end the search.

    Args:
        in_AA: list of symbols as produced by ``translate``.

    Returns:
        tuple: ``(protein_list, n_failures, leftover)`` where ``protein_list``
        is a list of symbol lists, ``n_failures`` is an int, and ``leftover``
        is every symbol that was not part of a consumed M..STOP frame, in
        original order.
    """
    protein_list = []
    leftover = []    # symbols skipped while looking for the next 'M'
    n_failures = 0
    cursor = 0       # first index of in_AA that has not been consumed yet
    total = len(in_AA)

    # Single forward scan: each symbol is visited a bounded number of times,
    # instead of re-scanning and re-slicing the whole list per protein.
    # Loop while anything is still unassigned (skipped or not yet scanned).
    while leftover or cursor < total:
        try:
            start_idx = in_AA.index('M', cursor)
        except ValueError:
            # Symbols remain but there is no start codon among them.
            n_failures += 1
            break
        try:
            stop_idx = in_AA.index('STOP', start_idx)
        except ValueError:
            # Start codon with no stop codon after it.
            n_failures += 1
            break

        if start_idx + 1 == stop_idx:
            # 'M' directly followed by 'STOP': the frame is consumed but
            # yields no protein.
            n_failures += 1
        else:
            protein_list.append(in_AA[start_idx + 1:stop_idx])

        leftover.extend(in_AA[cursor:start_idx])
        cursor = stop_idx + 1

    # Whatever was never consumed (including an unterminated 'M') is left over.
    leftover.extend(in_AA[cursor:])
    return protein_list, n_failures, leftover

In [153]:
def process_DNA(DNA):
    """Validate a DNA string, translate it, and extract its proteins.

    Args:
        DNA: string expected to contain only the characters C, G, T and A.

    Returns:
        tuple: ``(protein_list, n_failures, non_coding, n_noncoding_nuc)``.
        The first three values come from ``process_AA``; ``n_noncoding_nuc``
        is the nucleotide count derived from ``non_coding`` plus any trailing
        partial codon.

    Raises:
        Exception: if ``DNA`` contains any character other than C, G, T, A.
            The message lists the offending characters.
    """
    aberrant = [nucleotide for nucleotide in DNA if nucleotide not in 'CGTA']
    if aberrant:
        raise Exception(f"Provided DNA contains unrecognized nucleotides: {aberrant}")

    codons = codonize(DNA)
    AA = translate(codons)
    protein_list, n_failures, non_coding = process_AA(AA)
    # Each leftover symbol stands for three nucleotides; len(DNA) % 3 accounts
    # for the nucleotides of an incomplete final codon.
    n_noncoding_nuc = 3 * len(non_coding) + len(DNA) % 3
    return protein_list, n_failures, non_coding, n_noncoding_nuc

In [160]:
from statistics import mean 

def analyze_DNA(DNA):
    """Print a summary of the proteins encoded in ``DNA``.

    Runs ``process_DNA`` and prints either the protein count with the
    shortest, longest and mean protein lengths, or a notice that no protein
    was found. In both cases the failure count and the number of non-coding
    nucleotides are printed afterwards, followed by blank lines.

    Args:
        DNA: DNA string; passed unchanged to ``process_DNA``, whose
            exceptions propagate to the caller.

    Returns:
        None
    """
    protein_list, n_failures, non_coding, n_noncoding_nuc = process_DNA(DNA)
    protein_lengths = [len(protein) for protein in protein_list]

    if protein_lengths:
        print(f"The number of encoded proteins is {len(protein_list)}")
        print(f"The shortest protein is {min(protein_lengths)} amino acid(s) long.")
        print(f"The longest protein is {max(protein_lengths)} amino acid(s) long.")
        print(f"The average protein length is {mean(protein_lengths)} amino acid(s).")
    else:
        # min/max/mean are undefined for an empty list, so report that instead.
        print("There are no proteins encoded in this sequence.")

    # Reported whether or not any protein was found.
    print(f"Number of DNA failures: {n_failures}")
    print(f"Number of non-coding nucleotides: {n_noncoding_nuc} ")
    print('\n')
    return

In [161]:
# Print the protein summary for each example sequence in inputDNA, in order.
for DNA in inputDNA:
    analyze_DNA(DNA)

The number of encoded proteins is 1
The shortest protein is 3 amino acid's long.
The longest protein is 3 amino acid's long.
The average protein length is 3 amino acid's long.
Number of DNA failures: 0
Number of non-coding nucleotides: 0 


The number of encoded proteins is 3
The shortest protein is 1 amino acid's long.
The longest protein is 1 amino acid's long.
The average protein length is 1 amino acid's long.
Number of DNA failures: 0
Number of non-coding nucleotides: 0 


The number of encoded proteins is 3
The shortest protein is 2 amino acid's long.
The longest protein is 2 amino acid's long.
The average protein length is 2 amino acid's long.
Number of DNA failures: 1
Number of non-coding nucleotides: 12 


The number of encoded proteins is 1
The shortest protein is 2 amino acid's long.
The longest protein is 2 amino acid's long.
The average protein length is 2 amino acid's long.
Number of DNA failures: 0
Number of non-coding nucleotides: 0 


There are no proteins encoded in th

In [155]:
## FUNCTIONS FOR GENERATING TESTS
import random
import string

def randDNA(length):
    """Return a random string of ``length`` characters drawn from 'CGTA'.

    Each character is picked independently with ``random.choice``, so the
    result depends on the state of the global ``random`` generator.
    """
    picks = []
    for _ in range(length):
        picks.append(random.choice('CGTA'))
    return ''.join(picks)
def randLetters(length):
    """Return a random upper-case string of ``length`` ASCII letters.

    Letters are drawn one at a time from ``string.ascii_letters`` with
    ``random.choice`` and the joined result is upper-cased.
    """
    picks = []
    for _ in range(length):
        picks.append(random.choice(string.ascii_letters))
    joined = "".join(picks)
    return joined.upper()


In [166]:
## TESTS
# Edge cases for process_AA. Each call returns
# (protein_list, n_failures, leftover); assert the expected tuple so a
# regression fails loudly instead of the result being silently discarded.
assert process_AA(['STOP', 'M']) == ([], 1, ['STOP', 'M'])      # start with no later stop
assert process_AA(['A', 'A', 'A']) == ([], 1, ['A', 'A', 'A'])  # no start at all
assert process_AA(['M', 'STOP']) == ([], 1, [])                 # empty protein
assert process_AA(['STOP']) == ([], 1, ['STOP'])
assert process_AA(['M']) == ([], 1, ['M'])
assert process_AA(['']) == ([], 1, [''])

# Seed the generator so the random smoke tests below are reproducible.
random.seed(0)

# Smoke test on a long random sequence.
analyze_DNA(randDNA(100000))

# Random letters are expected to be rejected by process_DNA; catch and show
# the error so the cell (and Run All) completes.
try:
    process_DNA(randLetters(5))
except Exception as err:
    print(f"Exception: {err}")


The number of encoded proteins is 366
The shortest protein is 1 amino acid's long.
The longest protein is 144 amino acid's long.
The average protein length is 20.994535519125684 amino acid's long.
Number of DNA failures: 16
Number of non-coding nucleotides: 74662 




Exception: Provided DNA contains uncognized nucleotides: ['F', 'P', 'B', 'P']