In [71]:
'''Write a Python program that takes as input a file containing DNA sequences in multi-FASTA format, and 
computes the answers to the following questions. You can choose to write one program with multiple 
functions to answer these questions, or you can write several programs to address them. We will provide a 
multi-FASTA file for you, and you will run your program to answer the exam questions.'''

'Write a Python program that takes as input a file containing DNA sequences in multi-FASTA format, and \ncomputes the answers to the following questions. You can choose to write one program with multiple \nfunctions to answer these questions, or you can write several programs to address them. We will provide a \nmulti-FASTA file for you, and you will run your program to answer the exam questions.'

In [1]:
'''(1) How many records are in the file? A record in a FASTA file is defined as a single-line header, 
followed by lines of sequence data. The header line is distinguished from the sequence data by a 
greater-than (">") symbol in the first column. The word following the ">" symbol is the identifier of 
the sequence, and the rest of the line is an optional description of the entry. There should be no space 
between the ">" and the first letter of the identifier.'''

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

exseqs = (SeqIO.index('dna.example.fasta', 'fasta'))

def dict_seqs(seqs):
    '''
    Indexing fasta sequences into seqrecords and writing id and sequence into a dictionary
    '''
    seq_dict = {}
    for key, SeqRecord in seqs.items():
        seq_dict[key] = str(SeqRecord.seq)
        #print(key,SeqRecord, '\n')
    #print(len(seq_dict))
    return seq_dict

ex = dict_seqs(exseqs)


In [3]:
'''(2) What are the lengths of the sequences in the file? What is the longest sequence and what is the 
shortest sequence? Is there more than one longest or shortest sequence? What are their identifiers?'''

def sort_seqlens(seq_dict):
    '''
    Creates sorted list of tuples of seq id and sequence lengths from dictionary
    '''    
    seqlen_tuple = sorted((len(v),k) for (k,v) in seq_dict.items()) #list comprehension of tuple (k = id,v = length)
    return seqlen_tuple


In [4]:
def len_minmax(seqs_tuple):
    '''
    Returns max and min lengths of list of tuples
    '''
    longest = seqs_tuple[-1][0]
    shortest = seqs_tuple[0][0]
    return ('Shortest: ', shortest, 'Longest: ', longest)


In [5]:
def seq_dups(seq_tuples):
    '''
    Prints sequences with the same lengths from a list of tuples with id and sequence lengths
    '''
    #print(seq_tuples)
    len_dup = []
    for i in seq_tuples:  #checks for duplication of lengths i[0]
        if i[0] not in len_dup:
            len = i[0]
            len_dup.append(len)
        else:
            print('Same length, Id:', i[1], 'length:', i[0])
            continue
        #print(len_dup)


In [68]:
'''(3) In molecular biology, a reading frame is a way of dividing the DNA sequence of nucleotides into a set 
of consecutive, non-overlapping triplets (or codons). Depending on where we start, there are six possible 
reading frames: three in the forward (5' to 3') direction and three in the reverse (3' to 5'). 
For instance, the three possible forward reading frames for the sequence AGGTGACACCGCAAGCCTTATATTAGC are:

AGG TGA CAC CGC AAG CCT TAT ATT AGC

A GGT GAC ACC GCA AGC CTT ATA TTA GC

AG GTG ACA CCG CAA GCC TTA TAT TAG C

These are called reading frames 1, 2, and 3 respectively. An open reading frame (ORF) is the part of a 
reading frame that has the potential to encode a protein. It starts with a start codon (ATG), and ends 
with a stop codon (TAA, TAG or TGA). For instance, ATGAAATAG is an ORF of length 9.


Given an input reading frame on the forward strand (1, 2, or 3) your program should be able to identify 
all ORFs present in each sequence of the FASTA file, and answer the following questions: what is the 
length of the longest ORF in the file? What is the identifier of the sequence containing the longest ORF? 
For a given sequence identifier, what is the longest ORF contained in the sequence represented by that 
identifier? What is the starting position of the longest ORF in the sequence that contains it? 
The position should indicate the character number in the sequence. For instance, the following ORF in 
reading frame 1:

>sequence1

ATGCCCTAG

starts at position 1.

Note that because the following sequence:

>sequence2

ATGAAAAAA

does not have any stop codon in reading frame 1, we do not consider it to be an ORF in reading frame 1.'''


#Objective: to identify all ORFs present in each sequence of the FASTA file

#what is the length of the longest ORF in the file?
#What is the identifier of the sequence containing the longest ORF?


#For a given sequence identifier, what is the longest ORF contained in the sequence represented by that identifier?
#What is the starting position of the longest ORF in the sequence that contains it?

"(3) In molecular biology, a reading frame is a way of dividing the DNA sequence of nucleotides into a set \nof consecutive, non-overlapping triplets (or codons). Depending on where we start, there are six possible \nreading frames: three in the forward (5' to 3') direction and three in the reverse (3' to 5'). \nFor instance, the three possible forward reading frames for the sequence AGGTGACACCGCAAGCCTTATATTAGC are:\n\nAGG TGA CAC CGC AAG CCT TAT ATT AGC\n\nA GGT GAC ACC GCA AGC CTT ATA TTA GC\n\nAG GTG ACA CCG CAA GCC TTA TAT TAG C\n\nThese are called reading frames 1, 2, and 3 respectively. An open reading frame (ORF) is the part of a \nreading frame that has the potential to encode a protein. It starts with a start codon (ATG), and ends \nwith a stop codon (TAA, TAG or TGA). For instance, ATGAAATAG is an ORF of length 9.\n\n\nGiven an input reading frame on the forward strand (1, 2, or 3) your program should be able to identify \nall ORFs present in each sequence of the FASTA file, 

In [80]:
def rev(seq):
    '''
    Reverses sequence
    '''
    seq_rev = seq[::-1]
    return seq_rev

def complement(seq):  
    ''''Uses dictionary for complementation partners simplified (only lowercase)'''
    seq = seq.upper()
    base_comp = {'A':'T','T':'A','C':'G','G':'C','N':'N'}
    complement_seq = ''
    for n in seq:
        for key,value in base_comp.items():
            if n == key:
                complement_seq += value
    return complement_seq

def rev_complement(seq):
    seq = rev(seq)
    seq = complement(seq)
    return seq

def frame_seqs(seq_dict):
    '''
    Creates a list of each frame (all 6) of sequences in order: 1_F, 1_R, etc
    '''
    frames_dict = {}
    for key,seq in seq_dict.items():
        print(key)
        revcomp_seq = rev_complement(seq_dict[key])
        #print(revcomp_seq)
        frames = []
        for j in range(0, 3):
            frame_f = []
            frame_r = []
            for n in range(j,len(seq_dict[key]),3):
                codon = seq_dict[key][n:n+3]
                frame_f.append(codon)
            frames.append(frame_f)
            for n in range(j,len(revcomp_seq),3):
                codon = revcomp_seq[n:n+3]
                #print(codon)
                frame_r.append(codon)
            frames.append(frame_r)
        frames_dict[key] = frames
        continue 
            
    #print(frames_dict)       
    return frames_dict


'''ex_dict = {
    'a':'ATGCTGTAG',
    'b':'AAAATGCCTTAACCCCCGTAA'   
}'''
#print(ex_dict)

framed_dict = frame_seqs(ex_dict)

for key,value in framed_dict.items():
    print(key,value)

a
b
a [['ATG', 'CTG', 'TAG'], ['CTA', 'CAG', 'CAT'], ['TGC', 'TGT', 'AG'], ['TAC', 'AGC', 'AT'], ['GCT', 'GTA', 'G'], ['ACA', 'GCA', 'T']]
b [['AAA', 'ATG', 'CCT', 'TAA', 'CCC', 'CCG', 'TAA'], ['TTA', 'CGG', 'GGG', 'TTA', 'AGG', 'CAT', 'TTT'], ['AAA', 'TGC', 'CTT', 'AAC', 'CCC', 'CGT', 'AA'], ['TAC', 'GGG', 'GGT', 'TAA', 'GGC', 'ATT', 'TT'], ['AAT', 'GCC', 'TTA', 'ACC', 'CCC', 'GTA', 'A'], ['ACG', 'GGG', 'GTT', 'AAG', 'GCA', 'TTT', 'T']]


In [79]:
def find_start(list):
    '''
    Function returns index if a codon is id'd as start position in a list and returns a list of start indeces
    '''
    start = 'ATG'
    start_list = []
    for index,i in enumerate(list):  #order matters, index comes first then value of each item in list
        if i == start:  #if codon is start codon
            start_pos = index  #get starting index 
            start_list.append(start_pos)
        else:
            continue
    return start_list
    

def find_stop(list):
    '''
    Function finds each stop position in a list of string seq and returns a list of stop indeces
    '''
    stop = ['TAG', 'TGA', 'TAA']
    stop_list = []
    for index,i in enumerate(list):
        for codon in stop:
            if i == codon:  #for each stop codon in list
                stop_pos = index + 1  #get stop index (+1 to include the stop codon)
                #print('stop found', i)
                stop_list.append(stop_pos)
                #print(stop_list)
            else:
                continue
    return stop_list
            



def seq_orfs(frames_dict):
    
    '''
    This function finds ORFs in each frame and adds them to a dictionary: key = frame, value = orf list
    '''

    for key, value in frames_dict.items():
        #print(key,value)
        frames = value  #assigns all frames for a sequence to variable frames
        orf_dict = {}
        
        for index,frame in enumerate(frames):  #for each of the 6 frames, create frame label
            
            if index == 0:
                f = '1_F'
            if index == 1:
                f = '1_R'
            if index == 2:
                f = '2_F'
            if index == 3:
                f = '2_R'
            if index == 4:
                f = '3_F'
            if index == 5:
                f = '3_R'
              
            start = find_start(frame)
            stop = find_stop(frame)
            #print(start,stop)
            orf_dict[f] = []
            
            for i in start:
                orfs = []
                i = int(i)
                for j in stop:
                    j = int(j)
                    seq = ''.join(frame[i:j])
                    orfs.append(seq)
                #print(orfs)
                orf_dict[f] = orfs
        
        print(key, orf_dict)
    
            
        
                

        
           
                
        
            
    

seq_orfs(framed_dict)

a {'1_F': ['ATGCTGTAG'], '1_R': [], '2_F': [], '2_R': [], '3_F': [], '3_R': []}
b {'1_F': ['ATGCCTTAA', 'ATGCCTTAACCCCCGTAA'], '1_R': [], '2_F': [], '2_R': [], '3_F': [], '3_R': []}


In [30]:
def longest_orf(orf_dict):
    '''
    prints the longest orf of each sequence
    '''
    lens = []
    for key,value in orf_dict.items():
        for orf in value:
            lens.append(len(orf))
            sorted_lens = sorted(lens)
            #print(sorted_lens)
        longest_orf = sorted_lens[-1]
        return 'Longest orf for', key, 'is', longest_orf

exdict = {'1_F': ['ATGCCCTAG', 'ATGCCCTAGATGCCCTAGAAACGATAA'],
 '1_R': [],
 '2_F': [],
 '2_R': [],
 '3_F': [],
 '3_R': []}

longest_orf(exdict)

('Longest orf for', '1_F', 'is', 27)

In [48]:
def repeats(seqs_dict,n):
    for key,seq in seqs_dict.items():
        #print(seqs_dict[key])
        substrings = []
        repeat_dict = {}
        seq_list_by_n =[]
        
        for i in range(0,len(seq),n):  #default frame 1
            subseq = seq[i:i+n].upper()
            #print(subseq)
            if subseq not in substrings:
                substrings.append(subseq)
            #print(substrings)
            for s in substrings:
                repeat_dict[s] = 0
            #print(repeat_dict)
        
        for i in range(0,len(seq),n):  #default frame 1
            subseq = seq[i:i+n].upper()
            seq_list_by_n.append(subseq)
        
        #print(seq_list_by_n)
        for s in seq_list_by_n:
            repeat_dict[s] += 1
        return repeat_dict            
            


{'GAG': 3,
 'TTT': 1,
 'TGC': 1,
 'GCC': 1,
 'TAG': 1,
 'AAA': 1,
 'CGA': 1,
 'TAA': 1}