In [1]:
# PLACE YOUR INPUT MINI-CHROMOSOMES IN THIS LIST 
inputDNA = ['ATGGAACAAGAATGA',
       'ATGATTTAAATGATCTAAATGATTTAA',
       'CATATGATTATTTAAATCATGATTATTTAGGATATGGATATTTAGATT',
       'ATGATTATGTAA',
       'ATTATGTAA',
       'ATGCGTCGT']

The analyze_DNA function at the bottom of this page ties all of the functions in this code together. It takes a mini-chromosome as its input and will print out the required output of this code. 

Underneath the production cells, there are multiple testing cells. 

In [2]:
def codonize(DNA): 
# minifunction that takes nucleotide sequence and groups into threes
    return [DNA[idx : idx + 3] for idx in range(0, len(DNA), 3)]

In [3]:
def translate(codons): 
#translates a list of codons into AA sequence
    codon_dict = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'STOP', 'TAG':'STOP',
        'TGC':'C', 'TGT':'C', 'TGA':'STOP', 'TGG':'W'}
    AA = []
    for c in codons:
        if len(c) != 3: #Handles cases where number of nucleotides is not divisable by 3
            break
        if c not in codon_dict.keys():
            raise Exception(f"codon {c} has no associated amino acid")
        AA.append(codon_dict[c])
    return AA

In [4]:
def find_nuc_idx(protein_list): 
# tool to tell you the index of the first nucleotide in your protein so we
# can keep track of which nucleotides are dedicated to what proteins
    position = 0
    for protein in protein_list:
        position += (len(protein) * 3) + 6 #three nucleotides for each AA plus 6 for start and stop
    return position

def process_AA(in_AA):
# Takes a sequence of AA's and pulls out the proteins and logs the DNA failures
# Four major features:
# 1. Extract protein sequence
# 2. Logs DNA failures
# 3. Keeps running ledger of which nucleotides are coding
# 4. Finds fragment proteins (cases where there is an M in the middle of the protein)

    AA = in_AA.copy()
    protein_list = []
    n_failures = 0
    coding_nucleotides = []
    
    if 'M' not in AA and 'STOP' not in AA: # Handle cases where there are neither START or STOP in sequence
        n_failures = 2
        return protein_list, n_failures, coding_nucleotides
    
    
    while len(AA) > 0: #Run until there are no more AA's
        if 'M' not in AA: # no more start codons
            n_failures += 1
            break #Stop
        start_idx = AA.index('M')
        
        if 'STOP' not in AA[start_idx:]: # no more stop codons 
            n_failures += 1
            break #Stop       
        stop_idx  = AA[start_idx:].index('STOP') + start_idx
        
        
        if start_idx + 1 == stop_idx: # adjacent start and stop codons 
            n_failures += 1
            AA[start_idx:stop_idx+1]=[] # remove offending amino acids
            continue
        
        
        position = find_nuc_idx(protein_list) + (6 * n_failures) # let's us know nucleotide position despite the fact we have been removing AA's from our running list. also using failures to account for changes in position due to removed adjacent M+STOP's
        first_nuc = position + start_idx * 3 # nucleotide position of first nucleotide in START codon coding this protein
        last_nuc = position + (stop_idx + 1) * 3 # nucleotide position of first 
        coding_nucleotides.extend(list(range(first_nuc,last_nuc)))
        
        protein = AA[start_idx + 1: stop_idx] #logs protein as AA's between the start and the stop AA
        protein_list.append(protein) #adds protein to running list of proteins
        AA[start_idx:stop_idx+1]=[] #remove the protein from our running list of AA 
    
    # account for cases of M in the middle of a protein which you could count for the start of a new protein
    for protein in protein_list:
        if 'M' in protein:
            fragment = protein[protein.index('M')+1:]
            if len(fragment) == 0: # Indicates adjacent M and STOP amino acid
                n_failures += 1
            else: # add the fragment to protein list
                protein_list.append(fragment)
        
    return protein_list, n_failures, coding_nucleotides

In [5]:
def process_DNA(DNA):
#This function ties together the processing steps:
# 1. "Codonize" the nucleotides 
# 2. Translates codons
# 3. Extracts protein features, n DNA failures, and coding nucleotides from AA sequence
# Additionally this function makes sure all nucleotides are recognizable

    aberrant = [i for i in DNA if i not in 'CGTA'] #finds unrecognized nucleotides 
    if len(aberrant) > 0:
        raise Exception(f"Provided DNA contains uncognized nucleotides: {aberrant}")
    
    codons = codonize(DNA)
    AA = translate(codons)
    protein_list, n_failures, coding_nucleotides = process_AA(AA)
    return protein_list, n_failures, coding_nucleotides

In [6]:
from statistics import mean 

def analyze_DNA(DNA):
    
    protein_list     = []
    n_failures       = 0
    n_noncoding_nuc  = 0
    coding_nuc       = []
    
    for frame in range(3):#run three times, one for each reading frame
        
        proteinlist, nfailures, codingnuc = process_DNA(DNA[frame:]) #begin reading at different frames for each loop
        protein_list.extend(proteinlist) #running list of proteins across all reading frames 
        n_failures += nfailures # running tally of DNA failures 
        coding_nuc.extend([i + frame for i in codingnuc]) #running list of all of the nucleotides which contributed to proteins across each reading frame
        

    
    coding_nucleotides = set(coding_nuc) # all of the unique nucleotides which contributed to protein
    n_noncoding_nuc = len(DNA) - len(coding_nucleotides) # if A:S, then |A'|= |S| - |A| 
    
    protein_lengths = [len(i) for i in protein_list]
    if len(protein_lengths) == 0:
        print("There are no proteins encoded in this sequence.")
        print(f"Number of DNA failures: {n_failures}")
        print(f"Number of non-coding nucleotides: {n_noncoding_nuc} ")
        print('\n')
        return 
    print(f"The number of encoded proteins is {len(protein_list)}")
    print(f"The shortest protein is {min(protein_lengths)} amino acids long.")
    print(f"The longest protein is {max(protein_lengths)} amino acids long.")
    print(f"The average protein length is {int(mean(protein_lengths))} amino acids long.")
    print(f"Number of DNA failures: {n_failures}")
    print(f"Number of non-coding nucleotides: {n_noncoding_nuc} ")
    print('\n')
    return

In [7]:
for DNA in inputDNA:
    codon = codonize(DNA)
    AA = translate(codon)
    print(AA)
    analyze_DNA(DNA)


['M', 'E', 'Q', 'E', 'STOP']
The number of encoded proteins is 1
The shortest protein is 3 amino acids long.
The longest protein is 3 amino acids long.
The average protein length is 3 amino acid's long.
Number of DNA failures: 3
Number of non-coding nucleotides: 0 


['M', 'I', 'STOP', 'M', 'I', 'STOP', 'M', 'I', 'STOP']
The number of encoded proteins is 3
The shortest protein is 1 amino acids long.
The longest protein is 1 amino acids long.
The average protein length is 1 amino acid's long.
Number of DNA failures: 3
Number of non-coding nucleotides: 0 


['H', 'M', 'I', 'I', 'STOP', 'I', 'M', 'I', 'I', 'STOP', 'D', 'M', 'D', 'I', 'STOP', 'I']
The number of encoded proteins is 3
The shortest protein is 2 amino acids long.
The longest protein is 2 amino acids long.
The average protein length is 2 amino acid's long.
Number of DNA failures: 4
Number of non-coding nucleotides: 12 


['M', 'I', 'M', 'STOP']
The number of encoded proteins is 1
The shortest protein is 2 amino acids long.
The 

In [8]:
## FUNCTIONS FOR GENERATING TESTS
import random
import string

def randDNA(length): #Produce random string of nucleotides of desired length
    return ''.join(random.choice('CGTA') for _ in range(length))
def randLetters(length): #Produce random string of letters of desired length
    return "".join(random.choice(string.ascii_letters) for _ in range(length)).upper()


In [9]:
## TESTS
print(process_AA(['STOP','M']))
print(process_AA(['A','A','A']))
print(process_AA(['M','STOP']))
print(process_AA(['STOP']))
print(process_AA(['M']))
print(process_AA(['']))
print(process_AA(['M','M','A','STOP']))
print(process_AA(['M','M','STOP']))
print(process_AA(['M','M','M','STOP']))
print(process_AA(['M','A','M','M','STOP']))
# series of tests on process_AA function to make sure it can handle difficult AA sequences 

([], 1, [])
([], 2, [])
([], 1, [])
([], 1, [])
([], 1, [])
([], 2, [])
([['M', 'A'], ['A']], 0, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
([['M']], 1, [0, 1, 2, 3, 4, 5, 6, 7, 8])
([['M', 'M'], ['M']], 1, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
([['A', 'M', 'M'], ['M']], 1, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])


In [10]:
analyze_DNA(randDNA(500000)) #Handle arbitrarly long random nucleotide patterns 
# crucial test in finding bugs I was not aware I had!!

The number of encoded proteins is 7386
The shortest protein is 1 amino acids long.
The longest protein is 200 amino acids long.
The average protein length is 21 amino acid's long.
Number of DNA failures: 384
Number of non-coding nucleotides: 195718 




In [11]:
process_DNA(randLetters(5)) #Handle cases where random letters are provided instead of nucleotides 

Exception: Provided DNA contains uncognized nucleotides: ['E', 'Q', 'S', 'M', 'D']