In [3]:
import random
import collections

In [4]:
#DNA toolkit file

Nucleotides = ["A", "C", "G", "T"]
DNA_reverse_complement = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
DNA_codon = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
    }


def validateseq(dna_seq):
    """Check the sequence to make sure it is a DNA string"""
    tmpseq=dna_seq.upper()
    for nuc in tmpseq:
        if nuc not in Nucleotides:
            return false
    return tmpseq

def count_nuc_freq(seq):
    """Count the frequency of each nucleotide in a sequence"""
    tmpfreqdict={"A":0, "C":0, "G":0,"T":0}
    for i in seq:
        tmpfreqdict[i] += 1
    return tmpfreqdict
def transcription(seq):
    """Convert DNA to RNA"""
    return seq.replace('T','U')

def reverse_complement(seq):
    """Create reverse complement of the DNA"""
    return ''.join([DNA_reverse_complement[nuc] for nuc in seq])[::-1]
    #mapping=str.maketrans('ATCG','TAGC')
    #return seq.translate(mapping)[::-1]
    

def gc_content(seq):
    """Find GC content of the DNA"""
    return round((seq.count('C')) + seq.count('G')/ len(seq)* 100)

def gc_content_subsec(seq, k=20):
    """Find GC content of subset of the DNA"""
    res =[]
    for i in range(0, len(seq) -k + 1, k):
        subsec = seq[i:i+k]
        res.append(gc_content(subsec))
    return res

def translate(seq, init_pos=0):
    """Translate DNA to protein"""
    return [DNA_codon[seq[i:i+3]] for i in range(init_pos, len(seq) -2, 3)]

def codon_usage(seq, aminoacid):
    """Provides the frequency of each codon encoding a given amino acid in a sequence"""
    tmplist = []
    for i in range(0, len(seq) -2, 3):
        if DNA_codon[seq[i:i+3]]== aminoacid:
            tmplist.append(seq[i:i+3])
        
            freqDict = dict(collections.Counter(tmplist))
            totalweight = sum(freqDict.values())
            for seq in freqDict:
                freqDict[seq] = round(freqDict[seq]/totalweight, 2)
            return freqDict
        
def gen_reading_frames(seq):
    """Generate the six reading frames of a DNA sequence, including reverse complement"""
    frames= []
    frames.append(translate(seq, 0))
    frames.append(translate(seq, 1))
    frames.append(translate(seq, 2))
    frames.append(translate(reverse_complement(seq), 0))
    frames.append(translate(reverse_complement(seq), 1))
    frames.append(translate(reverse_complement(seq), 2))
    return frames

def proteins_from_rf(self, aa_seq):
        """Compute all possible proteins in an aminoacid seq and return a list of possible proteins"""
        current_prot = []
        proteins = []
        for i in aa_seq:
            if i == "_":
                # STOP accumulating amino acids if _ - STOP was found
                if current_prot:
                    for p in current_prot:
                        proteins.append(p)
                    current_prot = []
            else:
                # START accumulating amino acids if M - START was found
                if i == "M":
                    current_prot.append("")
                for i in range(len(current_prot)):
                    current_prot[i] += i
        return proteins


    

In [5]:
rndDNAstring= ''.join(random.choice(Nucleotides) for i in range(20))
print(validateseq(rndDNAstring))
print(count_nuc_freq(rndDNAstring))
print(transcription(rndDNAstring))
print(reverse_complement(rndDNAstring))
print(gc_content(rndDNAstring))
print(gc_content_subsec(rndDNAstring, k=5))
print(translate(rndDNAstring))
print(codon_usage(rndDNAstring, "L"))
for i in gen_reading_frames(rndDNAstring):
    print(i)

AGGAGTAACCGGGTCAAGAG
{'A': 7, 'C': 3, 'G': 8, 'T': 2}
AGGAGUAACCGGGUCAAGAG
CTCTTGACCCGGTTACTCCT
43
[60, 2, 61, 40]
['R', 'S', 'N', 'R', 'V', 'K']
None
['R', 'S', 'N', 'R', 'V', 'K']
['G', 'V', 'T', 'G', 'S', 'R']
['E', '_', 'P', 'G', 'Q', 'E']
['L', 'L', 'T', 'R', 'L', 'L']
['S', '_', 'P', 'G', 'Y', 'S']
['L', 'D', 'P', 'V', 'T', 'P']


In [6]:
def PatternCount(Text, Pattern):
    count = 0
    for i in range(len(Text)-len(Pattern)+1):
        if Text[i:i+len(Pattern)] == Pattern:
            count = count+1
    return count

def PatternFind(Text, Length):
    Pattern = ""
    dict = {}
    pattern_list = []
    for i in range(len(Text)-Length+1):
        Pattern = Text[i:(i+Length)]
        count = PatternCount(Text, Pattern)
        dict[Pattern] = count
    max_count = 0
    for key in dict:
        if dict.get(key) >= max_count:
            max_count = dict.get(key)
    for key in dict:
        if dict.get(key) == max_count:
            pattern_list.append(key)
    return pattern_list, max_count

ori = "TAAACGTGAGAGAAACGTGCTGATTACACTTGTTCGTGTGGTAT"
most_patterns, count = PatternFind(ori, 3)

print(f"Pattern {most_patterns} is/are the most frequent Pattern in the given sequence, appeared {count} times!")

Pattern ['GTG'] is/are the most frequent Pattern in the given sequence, appeared 4 times!
