In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

def get_info(fasta_file):
    '''
    Input: multi-fasta file
    Output: 
    '''
    for index, record in enumerate(SeqIO.parse(fasta_file, 'fasta')):
        print('Index %i, ID = %s, length %i, with %i features' % 
              (index, record.id, len(record.seq), len(record.features)))
        print(record)

#get_info('dna2.fasta')
ex_dict = {'a':'ATGCGGTAGGTTTAA', 'b':'CGATGCGGATGCGCGCTATGCCCTAA'}

In [3]:
def dict_seqs(fasta_file):
    '''
    Input: Fasta file 
    Output: Dictionary: key = sequence id, value = sequence from seqrecord
    '''
    seq_dict = {}
    for index, record in enumerate(SeqIO.parse(fasta_file, 'fasta')):
        seq_dict[record.id] = str(record.seq)
    return seq_dict

dictionary = dict_seqs('dna2.fasta')
print(len(dictionary))

18


In [4]:
def sort_seqlens(seq_dict):
    '''
    Input: Dictionary, key = sequence id, value = sequence from seqrecord
    Output: Returns sorted list of tuples of seq id and sequence lengths from dictionary
    '''    
    seqlen_tuple = sorted((len(v),k) for (k,v) in seq_dict.items()) #list comprehension of tuple (k = id,v = length)
    return seqlen_tuple

In [5]:
def len_minmax(seqs_tuple):
    '''
    Input:
    Output: Returns max and min lengths of list of tuples
    '''
    longest = seqs_tuple[-1][0]
    shortest = seqs_tuple[0][0]
    return ('Shortest: ', shortest, 'Longest: ', longest)


In [6]:
def seq_dups(seq_tuples):
    '''
    Input:
    Output: Prints sequences with the same lengths from a list of tuples with id and sequence lengths
    '''
    #print(seq_tuples)
    len_dup = []
    count = 0  
    for i in seq_tuples:  #checks for duplication of lengths i[0]
        print(i)
        count += 1
        if i[0] not in len_dup:
            len_dup.append(i[0])
            print('None', 'count',count)
        
        else:
            print('Same length, Id:', i[1], 'length:', i[0], 'count',count)
            

In [7]:
def rev(seq):
    '''
    Input: Sequence string
    Output: Reverses sequence string
    '''
    seq_rev = seq[::-1]
    return seq_rev

def complement(seq):  
    '''
    Input: Sequence string
    Output: Uses dictionary for complementary bases
    '''
    seq = seq.upper()
    base_comp = {'A':'T','T':'A','C':'G','G':'C','N':'N'}
    complement_seq = ''
    for n in seq:
        for key,value in base_comp.items():
            if n == key:
                complement_seq += value
    return complement_seq

def rev_complement(seq):
    '''
    Input: sequence string
    Output: reverse complement of the input sequence
    '''
    seq = rev(seq)
    seq = complement(seq)
    return seq

In [8]:
def frame_dict(seq_dict):
    '''
    Input: dictionary of sequences
    Output: Returns a dictionary of all 6 frames (1_F,1_R,2_F,2_R,3_F,3_R) of each sequence in a list
    '''
    frames_dict = {}
    for key,seq in seq_dict.items():
        #print(key)
        revcomp_seq = rev_complement(seq_dict[key])
        #print(revcomp_seq)
        frames = []
        for j in range(0, 3):
            frame_f = []
            frame_r = []
            for n in range(j,len(seq_dict[key]),3):
                codon = seq_dict[key][n:n+3]
                frame_f.append(codon)
            frames.append(frame_f)
            for n in range(j,len(revcomp_seq),3):
                codon = revcomp_seq[n:n+3]
                #print(codon)
                frame_r.append(codon)
            frames.append(frame_r)
        frames_dict[key] = frames
        continue
    return frames_dict

fd = frame_dict(ex_dict)

In [9]:
def get_repeats(frame_dict,n = 1,frame = 0): #default frame 1
    '''
    Input: dictionary of framed sequences with their identifiers
    Output: a nested dictionary of the number of repeats of a certain k-mer
    '''
    
    sequence_repeats = {}
    n = input('K-mer length? (default is 1)')
    frame = input('Frame (indeces 0 - 2)? (default is 0: frame 1)')
    
    #iterate thru sequences, create substrings list, repeat count dict, and a sequence list parsed by n    
    for key,seq in frame_dict.items():  
        #print(seqs_dict[key])
        substrings = []
        repeat_dict = {}
        seq_list_by_n =[]
        
        for i in frame:
            for i in range(frame,len(seq),n):  #create subsequences and add each unique one to substring list
                subseq = seq[i:i+n].upper()
                #print(subseq)
                if subseq not in substrings:
                    substrings.append(subseq)
                else:
                    pass

                #print(substrings)
                for s in substrings:  #add substring to repeat count dict as key
                    repeat_dict[s] = 0  # start count at 0

               #print(repeat_dict)
                seq_list_by_n.append(subseq)

            #print(seq_list_by_n)
            for s in seq_list_by_n:
                repeat_dict[s] += 1

            sequence_repeats[key] = repeat_dict
    
    return sequence_repeats 

In [10]:
def repeat_freq(framed_dict, n = 1):  #default length of repeat = 1
    '''
    Input: framed dictionary
    Output: list of tuples (repeat sequence, frequency)
    '''
    a = get_repeats(framed_dict, n)
    total = 0
    freq = []  #list of frequencies of each repeat
    for k,v in a.items():  #a is a nested dictionary key = sequence id, value = dictionary of repeat and number of occurences
        for k_1,v_1 in v.items():  #get total number of repeats
            total += v_1

        for k_2,v_2 in v.items():
            freq.append((k_2,(v_2/total)))

    s_freq = sorted(freq, key = lambda tup : tup[1])
    print('Highest freq repeat of ' + n + ' is' + s_freq[-1])
    return s_freq

In [11]:
def find_start(l):
    '''
    Input: takes in a list of framed codons
    Output: Returns index if a codon is id'd as start position in a list and returns a list of start indeces
    '''
    start = 'ATG'
    start_list = []
    for index,i in enumerate(l):  #order matters, index comes first then value of each item in list
        if i == start:  #if codon is start codon
            start_pos = index  #get starting index 
            start_list.append(start_pos)
        else:
            continue
    return start_list
    
def find_stop(l):
    '''
    Input: a list of framed codons
    Output: Function finds each stop position and returns a list of stop indeces
    '''
    stop = ['TAG', 'TGA', 'TAA']
    stop_list = []
    for index,i in enumerate(l):
        #print(i)
        for codon in stop:
            if i == codon:  #for each stop codon in list
                #print('y')
                stop_pos = index + 1  #get stop index (+1 to include the stop codon)
                #print('stop found', i)
                stop_list.append(stop_pos)
            else:
                continue
    #print(stop_list)
    return stop_list

for k,v in fd.items():
    print(k)
    for i in v:
        print(i)
        codon_list = i
        find_stop(i)

a
['ATG', 'CGG', 'TAG', 'GTT', 'TAA']
['TTA', 'AAC', 'CTA', 'CCG', 'CAT']
['TGC', 'GGT', 'AGG', 'TTT', 'AA']
['TAA', 'ACC', 'TAC', 'CGC', 'AT']
['GCG', 'GTA', 'GGT', 'TTA', 'A']
['AAA', 'CCT', 'ACC', 'GCA', 'T']
b
['CGA', 'TGC', 'GGA', 'TGC', 'GCG', 'CTA', 'TGC', 'CCT', 'AA']
['TTA', 'GGG', 'CAT', 'AGC', 'GCG', 'CAT', 'CCG', 'CAT', 'CG']
['GAT', 'GCG', 'GAT', 'GCG', 'CGC', 'TAT', 'GCC', 'CTA', 'A']
['TAG', 'GGC', 'ATA', 'GCG', 'CGC', 'ATC', 'CGC', 'ATC', 'G']
['ATG', 'CGG', 'ATG', 'CGC', 'GCT', 'ATG', 'CCC', 'TAA']
['AGG', 'GCA', 'TAG', 'CGC', 'GCA', 'TCC', 'GCA', 'TCG']


In [36]:
def orfs(framed_dict):
    '''
    Input: Dictionary: key = frame, value = list of codons corresponding to frame
    Output: Returns a nested dictionary, key = sequence id, value = dictionary, key = frame, value = start,stop,ORFs 
    '''
    seqs = {}  #create dictionary to hold orfs of all frames per sequence
    for key, value in framed_dict.items():
        print(key)
        seqs[key] = []
        orf_dict_list = []
        #print(key,value)
        frames = value  #assigns all frames for a sequence to variable frames
        orf_dict = {}
        for index,frame in enumerate(frames):  #for each of the 6 frames, create frame label
            #print(orf_dict)
            start = find_start(frame)  #find start indeces
            stop = find_stop(frame)  #find stop indeces
            
            if index == 0:
                f = '1_F'
            if index == 1:
                f = '1_R'
            if index == 2:
                f = '2_F'
            if index == 3:
                f = '2_R'
            if index == 4:
                f = '3_F'
            if index == 5:
                f = '3_R'
            
            orf_dict[f] = []
            #print(f,'start',start,'stop',stop)
            orfs = []
            orf_l = []
            indeces = []  #index list
            #print(indeces)
            orf_info = []
        
            for i in start:
                for j in stop:
                    #print(i,j)
                    if j > i:
                        index = [i,j]
                        #print(index)
                        indeces.append(index)  #append indeces to list
                        seq = frame[i:j]
                        #print(f)
                        orf = ''.join(seq)
                        orf_l.append(len(orf))  #append orf length to orf_l list
                        orfs.append(orf)  #append orf to orfs list
                        #print(indeces,orfs,orf_l)
                        zipped = list(zip(indeces,orfs,orf_l))
                        print(zipped)
                        #for k in zipped:
                            #print(k)
                        orf_info.append(zipped)      
                            
                        
                    else:
                        pass
                    
            #print(orf_info)
            #orf_dict[f].append(k)    
            #print(orf_dict)
                #print(orf_dict)           
                #print(orf_dict)
                
        seqs[key] = orf_dict
        #print(seqs)
        
    #for key,value in seqs.items():
        #print(key,value)
     #   for k,v in value.items():
            #print(v)
      #      value[k] = sorted(v, key = lambda tup : tup[3])
            #print(v)
    #return seqs

print(ex_dict)
orfs(fd)  #### problem with b, need the orfs to be in one list 

{'a': 'ATGCGGTAGGTTTAA', 'b': 'CGATGCGGATGCGCGCTATGCCCTAA'}
a
[([0, 3], 'ATGCGGTAG', 9)]
[([0, 3], 'ATGCGGTAG', 9), ([0, 5], 'ATGCGGTAGGTTTAA', 15)]
b
[([0, 8], 'ATGCGGATGCGCGCTATGCCCTAA', 24)]
[([0, 8], 'ATGCGGATGCGCGCTATGCCCTAA', 24), ([2, 8], 'ATGCGCGCTATGCCCTAA', 18)]
[([0, 8], 'ATGCGGATGCGCGCTATGCCCTAA', 24), ([2, 8], 'ATGCGCGCTATGCCCTAA', 18), ([5, 8], 'ATGCCCTAA', 9)]


In [13]:
def sorted_orfs(nested_dict):
    '''
    Input: nested dictionary, key = id, value = (start,stop, orf,length) by frame 
    Output: nested dictionary, key = id, value = list of tuples (id,frame,start,stop,orf length) sorted by orf length
    '''
    s_orf_len = {}  #sorted orf lengths
    for key,value in nested_dict.items():
        #print(key,value)

        for f,o in value.items():  # f = frame, o = start, stop, orf, orf_len
            s = o[0][0]  #start
            st = o[0][1]  #stop
            orf = o[0][2]  #orf
            l = o[0][3]
            
            if l:
                print(o)
                #print('l:',l)
                l_zip = []  #list for each length
                o_zip = []  #list for each orf
                
                for i in l: #for each length in l
                    #print('i:',i)
                    l_zip.append(i)
                    for j in orf:  #for each orf 
                        if len(j) == i:  #if length of orf corresponds to length item 
                            #print('j:',j, 'i:',i)
                            o_zip.append(j)
                        else:
                            pass
                        
                #print(l_zip,o_zip)  
                z = list(zip(o_zip,l_zip))
                print(z)
                
                z_f = list(zip(s,st,z))
                print(z_f)
            
          
            #z_sorted = [a,b,c,d for a,b,c,d in sorted(zip(s,st,o_zip,l_zip))]
            
            
            
        #s_orf_len[key] = (f,zipped)
    #return s_orf_len


sorted_orfs(b)
#print(c)

NameError: name 'b' is not defined

In [None]:
def dict_to_list(sorteddict_):
    '''
    Input: dictionary, key = sequence id, value = sorted list of tuples ('t') (id, frame, orf length)
    Output: list of sorted lists of tuples
    '''
    orflist = []
    for k,v in sorteddict_:
        l.append(v)
    return orflist
        
    

In [None]:
'''def orf_len_by_frame(sorted_orf_dict):
    
    Input: dictionary, key = sequence id, value = sorted list of tuples ('t') (id,frame, orf length), user input: frame to subset list by
    Output: Sorted list of orf lengths in requested frame
    
    l = sorted_orf_dict
    frame_list = []
    f_r = input('What frame do you want? (1_F, 1_R, 2_F, 2_R, 3_F, 3_R)')
    for k,v in l.items():
        #print(k,v)
        for t in v:
            if t[0] == f_r:
                frame_list.append(t[1])
            else:
                pass

    return frame_list'''

def orf_len_by_frame(orflist):
    '''
    Input: list of tuples (id,frame,orf length)
    Output: a list of tuples subset by frame
    '''
    subset_l = []
    frame = input('What frame do you want? (1_F, 1_R, 2_F, 2_R, 3_F, 3_R)')
    for i in orflist:
        if orflist[1] == frame:
            subset_l.append(i)
        else:
            pass
        
    return subset_l

In [None]:
'''def orf_len_by_id(sorted_orf_dict):
    
    Input:
    Output:
    
    l = sorted_orf_dict
    frame_list = []
    f_r = input('What frame do you want? (1_F, 1_R, 2_F, 2_R, 3_F, 3_R)')
    for k,v in l.items():
        #print(k,v)
        for t in v:
            if t[0] == f_r:
                frame_list.append(t[1])
            else:
                pass

    return frame_list'''

def orf_len_by_id(orflist):
    '''
    Input: list of list of tuples (id,frame,orf length)
    Output: a list of tuples subset by frame
    '''
    subset_l = []
    id_ = input('What id do you want?')
    for i in orflist:
        if orflist[0] == id_:
            subset_l.append(i)
        else:
            pass
        
    return subset_l


In [None]:
def longest_orfs(sorted_orf_lens_dict):
    '''
    Input: Dictionary, key = sequence id, value = list of tuples (frame, orf length) sorted by length
    Output: Dictionary, key = sequence id, value = list of tuples (frame, longest orf length) 
    '''
    longest_orfs = {}
    longest_orf = []
    for k,v in sorted_orf_lens_dict:
        longest_orfs[k] = (v[-1])
        longest_orf.append(v[-1])
    
    longest_orf = sorted(longest_orf, key = lambda tup : tup[1])[-1]
    print('The longest orf in the file is', longest_orf)
    
    return longest_orfs


In [None]:
def summary(fasta):
    dictionary = dict_seqs(fasta)
    lengths = sort_seqlens(dictionary)  #get sequence lengths and sort them into a list
    print('Number of sequences:', len(lengths)) 
    print('Length of sequences', lengths) 
    print('Smallest and longest sequences:', len_minmax(lengths)) 
    print('Are there multiple sequences with the same lengths?', seq_dups(lengths))
    print('Are there repeats in the sequences?', repeats(dictionary))

In [None]:
#starting position of the longest ORF in reading frame 3 in any of the sequences? ATG = position 1

b = orfs(fd)
print(b)
c = sorted_orfs(b)
print(c)

'''
for k,v in b.items():
    #print(k)
    orf_length =[]
    for i,j in v.items():
        frame = i
        if frame.find('_F') == True:
            print(j[0])
            #if j[0][3]:
                '''
                
                
                
                    
        #s_orf_l = sorted(orf_length, key = lambda tup : tup[3])
    #print(s_orf_l)





In [None]:
## What is the length of the longest ORF appearing in any sequence and in any forward reading frame?
d = longest_orfs(#INPUT DICTIONARY#)
x = []

for k,v in d.items():
    x.append((k,v))

print(orf_len_by_frame(x))

In [None]:
## What is the length of the longest forward ORF that appears in the sequence with the identifier gi|142022655|gb|EQ086233.1|16?

print(orf_len_by_id(x))

In [None]:
## Find the most frequently occurring repeat of length 6 in all sequences. How many times does it occur in all?

print(repeat_freq(##framed_dict##, 6))

In [None]:
## Find all repeats of length 12 in the input file. Let's use Max to specify the number of copies of the most frequent repeat of length 12. How many different 12-base sequences occur Max times?

mfr = repeat_freq(#framed_dict, 12)
Max = mfr[-1][1]

for i in mfr:
    if i[1] == max:
        print(i[0])

In [None]:
## Which one of the following repeats of length 7 has a maximum number of occurrences?

a = repeat_freq(#framed_dict, 7)
repeat = a[-1][0]
print(repeat)

In [None]:
ex_dict = {'a':'ATGCGGTAGGTTAA', 'b':'CGATGCGGATGCGCGCTTAA'}


print(ex_dict)
#summary(ex_dict)

frames = frame_dict(ex_dict)  #create frames from dictionary
orfs = orfs(frames)  #find orfs in each frame
sorted_orfs = sorted_orfs(orfs)  #get lengths and id of orfs 
#orf_list = dict_to_list(sorted_orfs)
#subset_frame = orf_len_by_frame(orf_list)
#subset_id_frame = orf_len_by_id(subset_frame)

#print(orf_len_frame[-1])  #print longest orf in this frame
    


    




