In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

def get_info(fasta_file):
    '''
    Input: multi-fasta file
    Output: sequence index, id, length, features
    '''
    for index, record in enumerate(SeqIO.parse(fasta_file, 'fasta')):
        print('Index %i, ID = %s, length %i, with %i features' % 
              (index, record.id, len(record.seq), len(record.features)))
        print(record)

get_info('dna2.fasta')
ex_dict = {'a':'ATGCGGTAGGTTTAA', 'b':'CGATGCGGATGCGCGCTATGCCCTAA'}

Index 0, ID = gi|142022655|gb|EQ086233.1|91, length 4635, with 0 features
ID: gi|142022655|gb|EQ086233.1|91
Name: gi|142022655|gb|EQ086233.1|91
Description: gi|142022655|gb|EQ086233.1|91 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
Number of features: 0
Seq('CTCGCGTTGCAGGCCGGCGTGTCGCGCAACGACGTGTGGGGCCTGACGGGCAGG...GCC', SingleLetterAlphabet())
Index 1, ID = gi|142022655|gb|EQ086233.1|304, length 1151, with 0 features
ID: gi|142022655|gb|EQ086233.1|304
Name: gi|142022655|gb|EQ086233.1|304
Description: gi|142022655|gb|EQ086233.1|304 marine metagenome JCVI_SCAF_1096627390048 genomic scaffold, whole genome shotgun sequence
Number of features: 0
Seq('CGCGATCCAGTAGCGCTTGTAGCCGAGCGCTTCGGCACGCTTCGCGAGCGCGAT...GCC', SingleLetterAlphabet())
Index 2, ID = gi|142022655|gb|EQ086233.1|255, length 4894, with 0 features
ID: gi|142022655|gb|EQ086233.1|255
Name: gi|142022655|gb|EQ086233.1|255
Description: gi|142022655|gb|EQ086233.1|255 marine metagenome JCVI_

In [2]:
def dict_seqs(fasta_file):
    '''
    Input: Fasta file 
    Output: Dictionary: key = sequence id, value = sequence from seqrecord
    '''
    seq_dict = {}
    for index, record in enumerate(SeqIO.parse(fasta_file, 'fasta')):
        seq_dict[record.id] = str(record.seq)
    return seq_dict

dictionary = dict_seqs('dna2.fasta')
print(len(dictionary))

18


In [3]:
def sort_seqlens(seq_dict):
    '''
    Input: Dictionary, key = sequence id, value = sequence from seqrecord
    Output: Returns sorted list of tuples of seq id and sequence lengths from dictionary
    '''    
    seqlen_tuple = sorted((len(v),k) for (k,v) in seq_dict.items()) #list comprehension of tuple (k = id,v = length)
    return seqlen_tuple

s = sort_seqlens(dictionary)

In [4]:
def len_minmax(seqs_tuple):
    '''
    Input: sorted list of tuples
    Output: Returns max and min lengths of list of tuples
    '''
    longest = seqs_tuple[-1][0]
    shortest = seqs_tuple[0][0]
    return ('Shortest: ', shortest, 'Longest: ', longest)

len_minmax(s)

('Shortest: ', 115, 'Longest: ', 4894)

In [5]:
def seq_length_dups(seq_tuples):
    '''
    Input: sorted list of tuples
    Output: Prints sequences with the same lengths from a list of tuples with id and sequence lengths
    '''
    #print(seq_tuples)
    len_dup = []
    count = 0  
    for i in seq_tuples:  #checks for duplication of lengths i[0]
        print(i)
        count += 1
        if i[0] not in len_dup:
            len_dup.append(i[0])
            print('None', 'count',count)
        
        else:
            print('Same length, Id:', i[1], 'length:', i[0], 'count',count)

seq_length_dups(s)

(115, 'gi|142022655|gb|EQ086233.1|346')
None count 1
(442, 'gi|142022655|gb|EQ086233.1|322')
None count 2
(890, 'gi|142022655|gb|EQ086233.1|88')
None count 3
(964, 'gi|142022655|gb|EQ086233.1|584')
None count 4
(967, 'gi|142022655|gb|EQ086233.1|594')
None count 5
(1151, 'gi|142022655|gb|EQ086233.1|304')
None count 6
(1352, 'gi|142022655|gb|EQ086233.1|75')
None count 7
(1432, 'gi|142022655|gb|EQ086233.1|277')
None count 8
(2095, 'gi|142022655|gb|EQ086233.1|4')
None count 9
(2646, 'gi|142022655|gb|EQ086233.1|527')
None count 10
(2867, 'gi|142022655|gb|EQ086233.1|250')
None count 11
(3511, 'gi|142022655|gb|EQ086233.1|45')
None count 12
(4076, 'gi|142022655|gb|EQ086233.1|396')
None count 13
(4338, 'gi|142022655|gb|EQ086233.1|293')
None count 14
(4564, 'gi|142022655|gb|EQ086233.1|454')
None count 15
(4635, 'gi|142022655|gb|EQ086233.1|91')
None count 16
(4804, 'gi|142022655|gb|EQ086233.1|16')
None count 17
(4894, 'gi|142022655|gb|EQ086233.1|255')
None count 18


In [6]:
def rev(seq):
    '''
    Input: Sequence string
    Output: Reverses sequence string
    '''
    seq_rev = seq[::-1]
    return seq_rev

def complement(seq):  
    '''
    Input: Sequence string
    Output: Uses dictionary for complementary bases
    '''
    seq = seq.upper()
    base_comp = {'A':'T','T':'A','C':'G','G':'C','N':'N'}
    complement_seq = ''
    for n in seq:
        for key,value in base_comp.items():
            if n == key:
                complement_seq += value
    return complement_seq

def rev_complement(seq):
    '''
    Input: sequence string
    Output: reverse complement of the input sequence
    '''
    seq = rev(seq)
    seq = complement(seq)
    return seq

In [7]:
def frame_dict(seq_dict):
    '''
    Input: dictionary of sequences
    Output: Returns a dictionary of all 6 frames (1_F,1_R,2_F,2_R,3_F,3_R) of each sequence in a list
    '''
    frames_dict = {}
    
    for key,seq in seq_dict.items():
        #print(key)
        revcomp_seq = rev_complement(seq_dict[key])  #reverse complement sequence
        #print(revcomp_seq)
        frames = []
        
        for j in range(0, 3):  #for 0,1,2 indeces: corresponds to frames
            frame_f = []
            frame_r = []
            for n in range(j,len(seq_dict[key]),3):  #create codon for each frame
                codon = seq_dict[key][n:n+3]
                frame_f.append(codon)  #add codon to forward frame
            
            frames.append(frame_f)  #append each forward frame to list of frames
            
            for n in range(j,len(revcomp_seq),3):  #do the same but with rev comp
                codon = revcomp_seq[n:n+3]
                #print(codon)
                frame_r.append(codon)
            
            frames.append(frame_r)
        
        frames_dict[key] = frames
        continue
        
    return frames_dict

a [['ATG', 'CGG', 'TAG', 'GTT', 'TAA'], ['TTA', 'AAC', 'CTA', 'CCG', 'CAT'], ['TGC', 'GGT', 'AGG', 'TTT', 'AA'], ['TAA', 'ACC', 'TAC', 'CGC', 'AT'], ['GCG', 'GTA', 'GGT', 'TTA', 'A'], ['AAA', 'CCT', 'ACC', 'GCA', 'T']]

b [['CGA', 'TGC', 'GGA', 'TGC', 'GCG', 'CTA', 'TGC', 'CCT', 'AA'], ['TTA', 'GGG', 'CAT', 'AGC', 'GCG', 'CAT', 'CCG', 'CAT', 'CG'], ['GAT', 'GCG', 'GAT', 'GCG', 'CGC', 'TAT', 'GCC', 'CTA', 'A'], ['TAG', 'GGC', 'ATA', 'GCG', 'CGC', 'ATC', 'CGC', 'ATC', 'G'], ['ATG', 'CGG', 'ATG', 'CGC', 'GCT', 'ATG', 'CCC', 'TAA'], ['AGG', 'GCA', 'TAG', 'CGC', 'GCA', 'TCC', 'GCA', 'TCG']]



In [8]:
def find_repeats(seq,kmer = 1): #default repeat length = 1
    
    '''
    Input: string sequence; kmer = k-mer length
    Output: dictionary key = repeat sequence of a certain kmer, value = repeat count
    '''
#iterate thru sequence, create 1) substrings list, 2) repeat count dictionary, and 3) a sequence list parsed by n for identifying repeats
    
    #print(seq)
    substrings = []
    subseq_count = {}  
    seq_list_by_n =[]
    
    #create subsequences and add each unique one to substring list
    for i in range(0,len(seq),kmer):
        subseq = seq[i:i+kmer]
        #print(subseq)
        
        if len(subseq) == kmer:
            seq_list_by_n.append(subseq)  #list for counting subseqs
            
            if subseq not in substrings:  #list for identifying unique subseqs
                substrings.append(subseq)
            
            else:
                pass

            #print(substrings)
            
            for s in substrings:  #add subseq to subseq_count dict as key
                subseq_count[s] = 0  # start count at 0

            for s in seq_list_by_n:  #count each repeat of subseq
                subseq_count[s] += 1
            
        else: 
            pass

    return subseq_count

In [59]:
def repeat_freq(repeat_dict):  #default length of k-mer repeat = 1
    
    '''
    Input: repeat dictionary
    Output: list of tuples (repeat sequence, frequency)
    '''
    
    repeat_list = []
    total = 0
    freq = []  #list of frequencies of each repeat
    kmer = []   
    
    for k,v in repeat_dict.items():  #have to iterate thru dictionary
        #print(k)
        kmer.append(len(k))
        #print(kmer)
        total += v
    
    for k,v in repeat_dict.items():
        f = round((v/total),3)
        #print(f)
        freq.append(f)
    
    #print(freq)
    s_freq = sorted(freq)
    #print(s_freq)
    print('Highest freq ' + str(kmer[1]) +'-mer ' + 'is ' + k + ' ' + str(s_freq[-1]))
    return s_freq

In [10]:
def find_start(l):
    
    '''
    Input: takes in a list of framed codons
    Output: Returns index if a codon is id'd as start position in a list and returns a list of start indeces
    '''
    
    start = 'ATG'
    start_list = []
    for index,i in enumerate(l):  #order matters, index comes first then value of each item in list
        if i == start:  #if codon is start codon
            start_pos = index  #get starting index 
            start_list.append(start_pos)
        else:
            continue
    return start_list
    
def find_stop(l):
    '''
    Input: a list of framed codons
    Output: Function finds each stop position and returns a list of stop indeces
    '''
    stop = ['TAG', 'TGA', 'TAA']
    stop_list = []
    for index,i in enumerate(l):
        #print(i)
        for codon in stop:
            if i == codon:  #for each stop codon in list
                #print('y')
                stop_pos = index + 1  #get stop index (+1 to include the stop codon)
                #print('stop found', i)
                stop_list.append(stop_pos)
            else:
                continue
    #print(stop_list)
    return stop_list

gi|142022655|gb|EQ086233.1|91
['CTC', 'GCG', 'TTG', 'CAG', 'GCC', 'GGC', 'GTG', 'TCG', 'CGC', 'AAC', 'GAC', 'GTG', 'TGG', 'GGC', 'CTG', 'ACG', 'GGC', 'AGG', 'GAG', 'GAT', 'CTC', 'GGC', 'GGC', 'GCC', 'AAC', 'TAT', 'GCG', 'GTC', 'TTT', 'CGG', 'CTC', 'GAA', 'AGC', 'CAG', 'TTC', 'CAG', 'ACC', 'TCC', 'GAC', 'GGC', 'GCG', 'CTG', 'ACC', 'GTG', 'CCC', 'GGC', 'TCC', 'GCA', 'TTC', 'AGT', 'TCG', 'CAA', 'GCC', 'TAC', 'GTC', 'GGG', 'CTC', 'GGC', 'GGC', 'GAC', 'TGG', 'GGG', 'ACC', 'GTG', 'ACG', 'CTC', 'GGG', 'CGC', 'CAG', 'TTC', 'GAT', 'TTC', 'GTC', 'GGC', 'GAT', 'CTG', 'ATG', 'CCG', 'GCT', 'TTC', 'GCG', 'ATC', 'GGC', 'GCG', 'AAC', 'ACG', 'CCG', 'GCC', 'GGC', 'CTG', 'CTC', 'GCG', 'TGG', 'GGC', 'TTG', 'CCG', 'GCG', 'AAT', 'GCG', 'TCG', 'GCG', 'GGC', 'GGT', 'GCG', 'CTC', 'GAC', 'AAC', 'CGC', 'GTG', 'TGG', 'GGC', 'GTC', 'CAG', 'GTG', 'AAC', 'AAT', 'GCG', 'GTG', 'AAG', 'TAC', 'GTG', 'AGC', 'CCG', 'ACG', 'TTC', 'GGC', 'GGA', 'TTG', 'TCG', 'TTC', 'GGC', 'GGC', 'CTG', 'TGG', 'GGC', 'TTC', 'GGC', 'AAC', 'GT

In [16]:
def orfs(framed_dict):
    
    '''
    Input: Dictionary: key = frame, value = list of codons corresponding to frame
    Output: Returns a nested dictionary, key0 = sequence id, value0 = dictionary, key1 = frame, value1 = sorted ([start,stop], orf, orf length)
    '''
    
    seqs = {}  #create dictionary to hold orfs of all frames per sequence
    for key, value in framed_dict.items():
        #print(key)
        seqs[key] = []
        orf_dict_list = []
        #print(key,value)
        frames = value  #assigns all frames for a sequence to variable frames
        orf_dict = {}
        for index,frame in enumerate(frames):  #for each of the 6 frames, create frame label
            #print(orf_dict)
            start = find_start(frame)  #find start indeces
            stop = find_stop(frame)  #find stop indeces
            
            if index == 0:
                f = '1_F'
            if index == 1:
                f = '1_R'
            if index == 2:
                f = '2_F'
            if index == 3:
                f = '2_R'
            if index == 4:
                f = '3_F'
            if index == 5:
                f = '3_R'
            
            orf_dict[f] = []
            #print(f,'start',start,'stop',stop)
            orfs = []
            orf_l = []
            indeces = []  #index list
            #print(indeces)
            orf_info = []
            
            
            for i in start:
                for j in stop:
                    #print(i,j)
                    if j > i:
                        index = [i,j]
                        #print(index)
                        indeces.append(index)  #append indeces to list
                        seq = frame[i:j]
                        #print(f)
                        orf = ''.join(seq)
                        orf_l.append(len(orf))  #append orf length to orf_l list
                        orfs.append(orf)  #append orf to orfs list
                        #print(indeces,orfs,orf_l)        
                    else:
                        pass
            
            zipped = list(zip(indeces,orfs,orf_l))
            #print(zipped)
            orf_dict[f].append(sorted(zipped, key = lambda tup : tup[2]))   
            #print(orf_dict)    
            
            seqs[key] = orf_dict
        
    return seqs  

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
def dict_to_list(sorteddict_):
    '''
    Input: dictionary, key = sequence id, value = sorted list of tuples ('t') (id, frame, orf length)
    Output: list of sorted lists of tuples
    '''
    orflist = []
    for k,v in sorteddict_:
        l.append(v)
    return orflist
        
    

In [None]:
def orf_len_by_frame(orflist):
    '''
    Input: list of tuples (id,frame,orf length)
    Output: a list of tuples subset by frame
    '''
    subset_l = []
    frame = input('What frame do you want? (1_F, 1_R, 2_F, 2_R, 3_F, 3_R)')
    for i in orflist:
        if orflist[1] == frame:
            subset_l.append(i)
        else:
            pass
        
    return subset_l

In [None]:
def orf_len_by_id(orflist):
    '''
    Input: list of list of tuples (id,frame,orf length)
    Output: a list of tuples subset by frame
    '''
    subset_l = []
    id_ = input('What id do you want?')
    for i in orflist:
        if orflist[0] == id_:
            subset_l.append(i)
        else:
            pass
        
    return subset_l


In [69]:
def summary(fasta):
    dictionary = dict_seqs(fasta)
    lengths = sort_seqlens(dictionary)  #get sequence lengths and sort them into a list
    print('Number of sequences:', len(lengths)) 
    print('Length of sequences', lengths) 
    print('Smallest and longest sequences:', len_minmax(lengths)) 
    print('Are there multiple sequences with the same lengths?', seq_dups(lengths))
    print('Are there repeats in the sequences?', repeats(dictionary))

In [None]:
def longest_orf_framedir(nested_orf_dict, frame = 1, direction = 'F'):
    
    '''
    Input: nested dictionary, key = seq id, value = sorted orfs and info by frame; frame = ['1','2','3'] (default = 1); direction = 'F','R' or 'B' (default = 'F')
    Output: longest orf with its info (seq id,[start,stop],orf sequence, length)
    '''
    
    longest_orfs = []
    #print(frame,direction)
    for i in frame:
        #print(i)
        for k,v in b.items():
            #print(k,v)
            for k1,v1 in v.items():
                #print(k1,v1)
                if k1[0] == i and k1[2] == direction:
                    #print(i,direction)
                    #print(k1,v1)
                    for x in v1:
                        if x:
                            #print(x)
                            longest_orfs.append((k,x[-1]))
                elif k1[0] == i and direction == 'B':
                    #print(k1,v1)
                    for y in v1:
                        if y:
                            #print(y)
                            longest_orfs.append((k,y[-1]))

                else:
                    pass

        return longest_orfs

In [37]:
## What is the length of the longest ORF appearing in any sequence and in any forward reading frame?
o = orfs(fd)
lo = longest_orf_framedir(o, frame = ['1','2','3'], direction = 'F')
#print(lo)
longest_orf = sorted(lo, key = lambda tup : tup[1][2])[-1]
print(longest_orf)

('gi|142022655|gb|EQ086233.1|16', ([88, 1596], 'ATGTCGTCAACGTCAGTTCGCGCTATGGCGCGGTGCAGTGGAACGGCCAGCGCATCGCGGGGCTGAACAACATCGAGCTGATCACCGACCGTCCGCTCGACGTGAACCTGAGAGAGATCGCGCAGGTGAAGCGCGACTGGCCGGACCGCGCGCTGATCGTGTCGCTGATGGTGCCGTGCAACGAGCGCGACTGGAAATGGATCCTGCCGCTCGTCGAGGATACGGGCGCCGACGCGGTCGAGCTGAACTTCGGTTGTCCGCACGGGATGAGCGAGCGCGGGATGGGCGCGGCGGTCGGGCAGGTGCCCGAATATGTGGAGATGGTCACGCGCTGGGTGAAGGAAGGCACGAAGCTGCCGTGCCTCGTGAAGCTCACGCCGAACATCAGCGACATCCGGATGGGGTCGCGCGCCGCGTACAAGGGCGGCGCGGACGGCGTGTCGCTGATCAACACGATCAACTCGATCGTCGCGGTCGATCTCGACCATATGGCGCCGATGCCGACGGTCGACGGCAAGGGCACGCACGGCGGCTATTGCGGCCCGGCGGTCAAGCCGATCGCATTGAACATGGTCGCGGAGATCGCACGTGACCCGGAAACGCCGAACCTGCCGATCTCGGGCATCGGCGGCATCTCGTCATGGCGCGACGCGGCGGAGTTCATGGTGCTCGGCGCCGGCAGCGTGCAGGTGTGCACCGCCGCGATGCATTACGGATTCCGGATCGTGTCGGACCTGGCCGACGGATTGTCGAACTGGATGGACGAGAAGGGCTACGCGACGCTCGACGACATTCGCGGCCGCGCGGTGCCGAACGTGACCGACTGGAAATACCTGAACCTGAAATACGACATCAAGGCGCGTATCGACCAGGACCGCTGCATCCAGTGCGGGTTGTGCCATATCGCGTGCGAGGACACGTCGCACCAGGCGATCACCGCGACGAAGGACGG

In [48]:
## What is the length of the longest forward ORF that appears in the sequence with the identifier gi|142022655|gb|EQ086233.1|16?

#print(lo)
_id = []
for t in lo:
    if t[0] == 'gi|142022655|gb|EQ086233.1|16':
        _id.append(t)

#print(_id)
longest_orf = sorted(_id, key = lambda x : x[1][2])[-1] 
print(longest_orf)

#help(sorted)

('gi|142022655|gb|EQ086233.1|16', ([88, 1596], 'ATGTCGTCAACGTCAGTTCGCGCTATGGCGCGGTGCAGTGGAACGGCCAGCGCATCGCGGGGCTGAACAACATCGAGCTGATCACCGACCGTCCGCTCGACGTGAACCTGAGAGAGATCGCGCAGGTGAAGCGCGACTGGCCGGACCGCGCGCTGATCGTGTCGCTGATGGTGCCGTGCAACGAGCGCGACTGGAAATGGATCCTGCCGCTCGTCGAGGATACGGGCGCCGACGCGGTCGAGCTGAACTTCGGTTGTCCGCACGGGATGAGCGAGCGCGGGATGGGCGCGGCGGTCGGGCAGGTGCCCGAATATGTGGAGATGGTCACGCGCTGGGTGAAGGAAGGCACGAAGCTGCCGTGCCTCGTGAAGCTCACGCCGAACATCAGCGACATCCGGATGGGGTCGCGCGCCGCGTACAAGGGCGGCGCGGACGGCGTGTCGCTGATCAACACGATCAACTCGATCGTCGCGGTCGATCTCGACCATATGGCGCCGATGCCGACGGTCGACGGCAAGGGCACGCACGGCGGCTATTGCGGCCCGGCGGTCAAGCCGATCGCATTGAACATGGTCGCGGAGATCGCACGTGACCCGGAAACGCCGAACCTGCCGATCTCGGGCATCGGCGGCATCTCGTCATGGCGCGACGCGGCGGAGTTCATGGTGCTCGGCGCCGGCAGCGTGCAGGTGTGCACCGCCGCGATGCATTACGGATTCCGGATCGTGTCGGACCTGGCCGACGGATTGTCGAACTGGATGGACGAGAAGGGCTACGCGACGCTCGACGACATTCGCGGCCGCGCGGTGCCGAACGTGACCGACTGGAAATACCTGAACCTGAAATACGACATCAAGGCGCGTATCGACCAGGACCGCTGCATCCAGTGCGGGTTGTGCCATATCGCGTGCGAGGACACGTCGCACCAGGCGATCACCGCGACGAAGGACGG

In [67]:
#starting position of the longest ORF in reading frame 3 in any of the sequences? ATG = position 1
lo1 = longest_orf_framedir(o, frame = ['3'], direction = 'F')
longest_orf1 = sorted(lo1, key = lambda tup : tup[1][2])[-1]
print(longest_orf1) 


('gi|142022655|gb|EQ086233.1|16', ([36, 1555], 'ATGGCCGACCTTCGCTGCACCATCGCGGGCATCACTTCGCCGAACCCTTTCTGGCTGGCGTCCGCGCCGCCGACCGACAAGGCCTACAACGTGAACCGCGCGTTCGAGGCGGGCTGGGGCGGGGTCGTCTGGAAGACGCTCGGGCTCGATCCGCATGTCGTCAACGTCAGTTCGCGCTATGGCGCGGTGCAGTGGAACGGCCAGCGCATCGCGGGGCTGAACAACATCGAGCTGATCACCGACCGTCCGCTCGACGTGAACCTGAGAGAGATCGCGCAGGTGAAGCGCGACTGGCCGGACCGCGCGCTGATCGTGTCGCTGATGGTGCCGTGCAACGAGCGCGACTGGAAATGGATCCTGCCGCTCGTCGAGGATACGGGCGCCGACGCGGTCGAGCTGAACTTCGGTTGTCCGCACGGGATGAGCGAGCGCGGGATGGGCGCGGCGGTCGGGCAGGTGCCCGAATATGTGGAGATGGTCACGCGCTGGGTGAAGGAAGGCACGAAGCTGCCGTGCCTCGTGAAGCTCACGCCGAACATCAGCGACATCCGGATGGGGTCGCGCGCCGCGTACAAGGGCGGCGCGGACGGCGTGTCGCTGATCAACACGATCAACTCGATCGTCGCGGTCGATCTCGACCATATGGCGCCGATGCCGACGGTCGACGGCAAGGGCACGCACGGCGGCTATTGCGGCCCGGCGGTCAAGCCGATCGCATTGAACATGGTCGCGGAGATCGCACGTGACCCGGAAACGCCGAACCTGCCGATCTCGGGCATCGGCGGCATCTCGTCATGGCGCGACGCGGCGGAGTTCATGGTGCTCGGCGCCGGCAGCGTGCAGGTGTGCACCGCCGCGATGCATTACGGATTCCGGATCGTGTCGGACCTGGCCGACGGATTGTCGAACTGGATGGACGAGAAGGGCTACGCGACGCTCGACGACATTCGCG

In [60]:
## Find the most frequently occurring repeat of length 6 in all sequences. How many times does it occur in all?

repeat_list = []
for key,seq in fd.items():  #have to iterate thru dictionary 
        print(key)
        for i in seq:
            #print(i)
            sequence = []
            sequence += i
            sequence = ''.join(sequence)
            #print(sequence)
            kmer = 6
            repeats = find_repeats(sequence,kmer)  #remember 10 -> kmer length = 9
            freq = repeat_freq(repeats)
            print(freq[-1])
            repeat_list.append(freq[-1])

print(sorted(repeat_list)[-1])
            

gi|142022655|gb|EQ086233.1|91
Highest freq 6-mer is CCGCTC 0.006
0.006
Highest freq 6-mer is CAACGC 0.01
0.01
Highest freq 6-mer is CGGTGC 0.008
0.008
Highest freq 6-mer is AACGCG 0.012
0.012
Highest freq 6-mer is CGACGC 0.012
0.012
Highest freq 6-mer is CCTGCA 0.008
0.008
gi|142022655|gb|EQ086233.1|304
Highest freq 6-mer is ACTCCT 0.021
0.021
Highest freq 6-mer is ACTGGA 0.01
0.01
Highest freq 6-mer is CTCCTG 0.016
0.016
Highest freq 6-mer is CTGGAT 0.016
0.016
Highest freq 6-mer is TCCTGG 0.01
0.01
Highest freq 6-mer is TGGATC 0.016
0.016
gi|142022655|gb|EQ086233.1|255
Highest freq 6-mer is TATGCG 0.007
0.007
Highest freq 6-mer is GACGTC 0.007
0.007
Highest freq 6-mer is ATGCGT 0.011
0.011
Highest freq 6-mer is CGCGTC 0.009
0.009
Highest freq 6-mer is TGCGTT 0.007
0.007
Highest freq 6-mer is GCGTCG 0.007
0.007
gi|142022655|gb|EQ086233.1|45
Highest freq 6-mer is CTCGCC 0.01
0.01
Highest freq 6-mer is GAGCAC 0.009
0.009
Highest freq 6-mer is CGCTCC 0.009
0.009
Highest freq 6-mer is AGC

In [64]:
## Find all repeats of length 12 in the input file. Let's use Max to specify the number of copies of the most frequent repeat of length 12. How many different 12-base sequences occur Max times?

repeat_list = []
for key,seq in fd.items():  #have to iterate thru dictionary 
        print(key)
        for i in seq:
            #print(i)
            sequence = []
            sequence += i
            sequence = ''.join(sequence)
            #print(sequence)
            kmer = 12
            repeats = find_repeats(sequence,kmer)  #remember 10 -> kmer length = 9
            freq = repeat_freq(repeats)
            #print(freq[-1])
            repeat_list.append(freq[-1])

#print(repeat_list)
print(sorted(repeat_list))

gi|142022655|gb|EQ086233.1|91
Highest freq 12-mer is GCGGTGCGCGGT 0.003
Highest freq 12-mer is GGCCTGCAACGC 0.003
Highest freq 12-mer is CGGTGCGCGGTG 0.003
Highest freq 12-mer is GCCTGCAACGCG 0.003
Highest freq 12-mer is GGTGCGCGGTGC 0.003
Highest freq 12-mer is CCTGCAACGCGA 0.003
gi|142022655|gb|EQ086233.1|304
Highest freq 12-mer is CGCTGGTATAGG 0.011
Highest freq 12-mer is GCTACAAGCGCT 0.011
Highest freq 12-mer is GCTGGTATAGGA 0.011
Highest freq 12-mer is CTACAAGCGCTA 0.011
Highest freq 12-mer is CTGGTATAGGAC 0.011
Highest freq 12-mer is TACAAGCGCTAC 0.011
gi|142022655|gb|EQ086233.1|255
Highest freq 12-mer is GGCATCAACGTC 0.002
Highest freq 12-mer is CCTCGACGCGGA 0.002
Highest freq 12-mer is GCATCAACGTCT 0.002
Highest freq 12-mer is CTCGACGCGGAG 0.002
Highest freq 12-mer is CATCAACGTCTA 0.002
Highest freq 12-mer is TCGACGCGGAGC 0.002
gi|142022655|gb|EQ086233.1|45
Highest freq 12-mer is GCGCTCCGCCTG 0.003
Highest freq 12-mer is GATAGTCGTGCC 0.007
Highest freq 12-mer is CGCTCCGCCTGC 0.

In [65]:
## Which one of the following repeats of length 7 has a maximum number of occurrences?

repeat_list = []
for key,seq in fd.items():  #have to iterate thru dictionary 
        print(key)
        for i in seq:
            #print(i)
            sequence = []
            sequence += i
            sequence = ''.join(sequence)
            #print(sequence)
            kmer = 7
            repeats = find_repeats(sequence,kmer)  #remember 10 -> kmer length = 9
            freq = repeat_freq(repeats)
            #print(freq[-1])
            repeat_list.append(freq[-1])

#print(repeat_list)
print(sorted(repeat_list))

gi|142022655|gb|EQ086233.1|91
Highest freq 7-mer is GCGGTGC 0.008
Highest freq 7-mer is AACGCGA 0.005
Highest freq 7-mer is CGGTGCC 0.005
Highest freq 7-mer is ACGCGAG 0.008
Highest freq 7-mer is GGTGCGC 0.006
Highest freq 7-mer is CCTGCAA 0.005
gi|142022655|gb|EQ086233.1|304
Highest freq 7-mer is CTCCTGG 0.012
Highest freq 7-mer is CTGGATC 0.012
Highest freq 7-mer is TCCTGGG 0.012
Highest freq 7-mer is TGGATCG 0.012
Highest freq 7-mer is CCTGGGC 0.012
Highest freq 7-mer is GGATCGC 0.012
gi|142022655|gb|EQ086233.1|255
Highest freq 7-mer is ACGTCTA 0.006
Highest freq 7-mer is GCGTCGA 0.007
Highest freq 7-mer is GCGTTCG 0.007
Highest freq 7-mer is CGGAGCG 0.006
Highest freq 7-mer is GTCTATG 0.004
Highest freq 7-mer is GGAGCGC 0.006
gi|142022655|gb|EQ086233.1|45
Highest freq 7-mer is CCTGCTC 0.022
Highest freq 7-mer is TGCCGAG 0.02
Highest freq 7-mer is CTGCTCG 0.022
Highest freq 7-mer is GCCGAGC 0.02
Highest freq 7-mer is TGCTCGC 0.02
Highest freq 7-mer is CCGAGCA 0.02
gi|142022655|gb|EQ

In [66]:
#LIST OF FUNCTIONS:

get_info(fasta_file)
summary(fasta)
dict_seqs(fasta_file)
sort_seqlens(seq_dict)
len_minmax(seqs_tuple)
seq_length_dups(seq_tuples)
rev(seq)
complement(seq)
rev_complement(seq)
frame_dict(seq_dict)
find_repeats(seq,kmer = 1)
repeat_freq(repeat_dict)
find_start(l)
find_stop(l)
orfs(framed_dict)
dict_to_list(sorteddict_) -- not used
orf_len_by_frame(orflist) -- not used
orf_len_by_id(orflist)  -- not used
longest_orf_framedir(nested_orf_dict, frame = 1, direction = 'F')

NameError: name 'summary' is not defined

In [None]:
fd = frame_dict(dictionary)

for k,v in fd.items():
    #print(k,v)
    #print()
    
    
for k,v in fd.items():
    print(k)
    for i in v:
        print(i)
        codon_list = i
        find_stop(i)

In [70]:
help(summary())

TypeError: summary() missing 1 required positional argument: 'fasta'