In [1]:
# Methods from Previous Weeks

def Neighbors(pattern, d):
    bases     = ['A', 'C', 'G', 'T']
    
    if d == 0:
        return (pattern)
    if len(pattern) == 1:
        return (bases)

    neighbors       = []
    suffix_neighbors = Neighbors(pattern[1:], d)
    for text in suffix_neighbors:
        if HammingDistance(pattern[1:], text) < d:
            for base in bases:
                neighbors.append(base + text)
        else :
            neighbors.append(pattern[0] + text)
    return(neighbors)

def HammingDistance(seq1, seq2):
    if len(seq1) != len(seq2):
        raise ValueError("Input sequences must have equal length.")
    
    distance = 0
    for base1, base2 in zip(seq1, seq2):
        if base1 != base2:
            distance += 1

    return distance

In [2]:
# Stepik 1.2 - Exercise Break

# What is the expected number of occurrences of a 9-mer in 500 random DNA strings, 
# each of length 1000? Assume that the sequences are formed by selecting each 
# nucleotide (A, C, G, T) with the same probability (0.25).

# Note: Express your answer as a decimal; allowable error = 0.0001.

string_count = 500
length = 1000
k = 9
nucleotide_probability = .25

total_strings = string_count * (1 + (length - k))

num_of_mismatches = total_strings * (nucleotide_probability ** k)

print("Total Number of Strings =", total_strings)
print("Expected Number of Occurences =", num_of_mismatches)


Total Number of Strings = 496000
Expected Number of Occurences = 1.89208984375


In [4]:
# Stepik 1.2 - Mofif Problem

# Given a collection of strings Dna and an integer d, a k-mer is a (k,d)-motif 
# if it appears in every string from Dna with at most d mismatches. For example, 
# the implanted 15-mer in the strings above represents a (15,4)-motif.

# Implanted Motif Problem: Find all (k, d)-motifs in a collection of strings.

# Input: A collection of strings Dna, and integers k and d.
# Output: All (k, d)-motifs in Dna.

from collections import Counter

def MotifEnumeration(genome, k, d):
    genome = genome.split(" ") 
    patterns     = []
    patterns_all = []
    for text in genome:
        patterns_text = []
        for start in range(len(text) - k + 1):
            end     = start + k
            pattern = list(text[start: end])
            patterns_text.extend(Neighbors(pattern, d)) 
            
        patterns_text = list(set(patterns_text))
        patterns_all.extend(patterns_text)
        
    patterns_count = Counter(patterns_all)
    for pattern, count in patterns_count.items():
        if count == len(genome):
            patterns.append(pattern)
            
    return (patterns)

# Sample Test (output = ATA ATT GTT TTT)
k    = 3
d    = 1
genome ='ATTTGGC TGCCTTA CGGTATC GAAAATT'

print("Sample Implanted Motifs =", " ".join(MotifEnumeration(genome, k, d)))

Sample Implanted Motifs = GTT ATT ATA TTT


In [5]:
# Randomized Problem
k    = 5
d    = 1
genome ='GCATCATCCTGCGTTTGGGGGAAGG TGAGGACTCTGCTACAAGGAGCTTC GCTTCCGAGTAGGGGTTACCGTTAC AGAGTGCTTCTGGCGTCCACAGACG GGCGTCAACATCCTTGACAGGCGTC TGCTCGGGAATTACTAAACAGCGTC'

print("Randomized Implanted Motifs =", " ".join(MotifEnumeration(genome, k, d)))

Randomized Implanted Motifs = AGCGT GCGTC GCATC GCCTC GCTTG GCTTC


In [6]:
# Stepik 1.3 - Scoring Motifs

# The minimium possible value of Score(Motifs) is 0 (if all the k-mers in 
# Motifs are the same). What is the maximum possible value of Score(Motifs) 
# for 10 motifs of length 15?

# Not sure how to program this, but here's my thinking.

motifs_count = 10
length = 15
lower_case_no_color_count = 3

print("Score =", (motifs_count - lower_case_no_color_count) * length)



Score = 105


In [8]:
# Stepik 1.3 - Scoring Motifs Continued

import numpy as np

def ProfileMatrix(motifs):
    if type(motifs) == str:
        strings  = list(motifs)
        n_string = len(strings)
        k        = len(strings[0])
        profile_matrix  = np.array([0] * 4 * k, dtype='f').reshape(4, k)
    
    if type(motifs) == list:
        motifs_array = np.array([list(seq) for seq in motifs])
        profile_matrix = np.zeros((4, motifs_array.shape[1]))

    for i in range(motifs_array.shape[1]):
        column_i = motifs_array[:, i]
        len_column_i = column_i.shape[0]
        count = Counter(column_i)
        profile_matrix[0, i] = count["A"] / len_column_i
        profile_matrix[1, i] = count["C"] / len_column_i
        profile_matrix[2, i] = count["G"] / len_column_i
        profile_matrix[3, i] = count["T"] / len_column_i

    return profile_matrix


def Score(motifs):
    motifs_array = np.array([list(seq) for seq in motifs])
    score = 0

    for i in range(motifs_array.shape[1]):
        column_i = motifs_array[:, i]
        count = Counter(column_i)
        max_freq = count.most_common(1)[0][1]
        score += column_i.shape[0] - max_freq

    return score


motifs = [
    "TCGGGGGTTTTT",
    "CCGGTGACTTAC",
    "ACGGGGATTTTC",
    "TTGGGGACTTTT",
    "AAGGGGACTTCC",
    "TTGGGGACTTCC",
    "TCGGGGATTCAT",
    "TCGGGGATTCCT",
    "TAGGGGAACTAC",
    "TCGGGTATAACC",
]

print(Score(motifs))
print(ProfileMatrix(motifs))

30
[[0.2 0.2 0.  0.  0.  0.  0.9 0.1 0.1 0.1 0.3 0. ]
 [0.1 0.6 0.  0.  0.  0.  0.  0.4 0.1 0.2 0.4 0.6]
 [0.  0.  1.  1.  0.9 0.9 0.1 0.  0.  0.  0.  0. ]
 [0.7 0.2 0.  0.  0.1 0.1 0.  0.5 0.8 0.7 0.3 0.4]]


In [9]:
motifs = [
    "GGCGTTCAGGCA",
    "AAGAATCAGTCA",
    "CAAGGAGTTCGC",
    "CACGTCAATCAC",
    "CAATAATATTCG",
]

print(Score(motifs))
print(ProfileMatrix(motifs))

28
[[0.2 0.8 0.4 0.2 0.4 0.4 0.2 0.8 0.  0.  0.2 0.4]
 [0.6 0.  0.4 0.  0.  0.2 0.4 0.  0.  0.4 0.6 0.4]
 [0.2 0.2 0.2 0.6 0.2 0.  0.2 0.  0.4 0.2 0.2 0.2]
 [0.  0.  0.  0.2 0.4 0.4 0.2 0.2 0.6 0.4 0.  0. ]]


In [11]:
# Stepik 1.3 - Exercise Break

# Compute the entropy of the NF-κB motif matrix (reproduced below).

def Entropy(motifs):
    Profile = {}

    # Check motifs length
    list_len = len(motifs)
    L1 = len(motifs[1]) # length of the first motif
    print('There are {} motifs of length {}'.format(list_len, L1))
    
    for i in range(len(motifs)):
        if len(motifs[i]) != L1:
            short_motif = motifs[i]
            short_motif_len = len(short_motif)
            print('Oops, Motif {} is {} nucleotides instead of {}!'.format(short_motif, short_motif_len, L1))
            break
        
    # Fill positions with frequency of 0
    for nucleotide in 'ACGT':
        values = [0] * L1
        Profile[nucleotide] = values
        
    # Iterate through each position, counting nucleotide frequencies
    total_entropy = 0
    for key, values in Profile.items():
        for motif in motifs:
            for i in range(len(motif)):
                if motif[i] == key:
                    Profile[key][i] += 1
        
        # Convert frequencies to probabilities
        for i in range(len(values)):
            Profile[key][i] = Profile[key][i] / float(list_len)
        
        # Calculate total entropy (Sum of (Prob_value * log2 Prob_n))
        import math
        for value in values:
            if value > 0:
                total_entropy += abs(value * math.log(value, 2))
            else: continue
            
    return(total_entropy)

motifs = [
"TCGGGGGTTTTT",
"CCGGTGACTTAC",
"ACGGGGATTTTC",
"TTGGGGACTTTT",
"AAGGGGACTTCC",
"TTGGGGACTTCC",
"TCGGGGATTCAT",
"TCGGGGATTCCT",
"TAGGGGAACTAC",
"TCGGGTATAACC"
]

print("Entropy Score =", Entropy(motifs))

There are 10 motifs of length 12
Entropy Score = 9.916290005356972


In [12]:
# Stepik 1.4 Implement MedianString

# Input: An integer k, followed by a space-separated collection of strings Dna.
# Output: A k-mer Pattern that minimizes d(Pattern, Dna) among all possible choices 
# of k-mers. (If there are multiple such strings Pattern, then you may return any one.)

def Number2Symbol(index):
    nucleotide = {"A":0, "C":1, "G":2, "T":3}
    for k, v in nucleotide.items():
        if v == index:
            return k
        
def Number2Pattern(index, k):
    if k == 1:
        return Number2Symbol(index)
    prefix_index = index // 4
    r = index % 4 
    symbol = Number2Symbol(r)
    prefix_pattern = Number2Pattern(prefix_index, k-1)
    return "".join([prefix_pattern, symbol]) 

def DistanceBetweenStrings(pattern, dnas):
    if type(dnas) == str:
        dnas = dnas.split(" ")
        
    distance = 0
    k        = len(pattern) 
    
    for text in dnas:
        H_distance = float('Inf')
        for start in range(len(text) - k + 1):
            end    = start + k
            string = text[start: end]
            
            if H_distance > HammingDistance(pattern, string):
                H_distance = HammingDistance(pattern, string)
        distance = distance + H_distance
    
    return distance

def MedianString(genome,k):
    
     # If genome string, change to list
    if type(genome) == str:
        genome = genome.split(" ")
    
    distance = float('Inf')
    median = ''

    # Iterate through string patterns
    for i in range(4**k):
        pattern = Number2Pattern(i, k)
        cur_distance = DistanceBetweenStrings(pattern, genome)

        if cur_distance < distance:
            distance = DistanceBetweenStrings(pattern, genome)
            median = pattern
    
    return median

# Sample Test (output = GAC)
k    = 3
genome ='AAATTGACGCAT GACGACCACGTT CGTCAGCGCCTG GCTGAGCACCGG AGTTCGGGACAG'

print("Sample Median String =", MedianString(genome, k))

Sample Median String = GAC


In [13]:
# Sample Test (output = 5)
pattern = 'AAA'
dnas = 'TTACCTTAAC GATATCTGTC ACGGCGTTCG CCCTAAAGAG CGTCAGAGGT'

print("Sample Distance Result =", DistanceBetweenStrings(pattern, dnas))

# Random Test
pattern = 'ACTAAGA'
dnas = 'TACTATAGTGGGTACGGTGGTTAGGGACTGATTAGGCGGGTGATGCCAACCGCCTGCCCCTTCACTCTAAGAGCGTAGCAGATAGCTGGGGACTGGTC ACTATTGCTATTACTCTAGAGAAAAATTATCTGCACTATAGCTAGGCCGCAAATCGCTACGAGAAACACCCTCGTGAGCAGACGTGGTATTAAGCTCC GACCAAACCCTCGCCAAAACGATCACGCGGTCTCCTGGTTTCGATGTCGCGCACTTGTAAAGGGGAGTTAGATGTAAACAAGGCTAGCAGTCGTGTTC CTTCGCTGAGTTGGCCGCTGGTGGACGACAACCAACTTGGGTCAAGTTGGGACCAACCGCGCAGCGATGAATGAGCGAAAATCTGTTAAGCGCCGCGG TCGTCCCAAGAATATTGGGCCCCATGCTGTAGAAGTAGATGCATAGACGCTGGCCAGTCTCAGCATGACTCACGACATCCTCGGTTGCGAGCAGAAGG GAGCGATATCGCTCCGTCATTTGCGATTGATTCCCAGTACAACGGGTGGTCTTGGACGTACCTAGTAATACAGAGCACGTCTTGTTGCGAATATAATC GTTACGCATTAGACCGGCTATCTTCCAGCAGATGCTGGGCCCTCTGGTGTTAAGGATCACTCTAAGATGCCTAGCGGATTAGGCTGAAAGAACAACCG CAGACTCACCGGGTAACCGAGTTTGAACGGCAATTACACAGAATATCTGGGCCGAAGTGTGGCGGGGACGGACCAGTTCGCTGCCTACTCACTTTCTA CCTCTGTGAGGTGTATCACAGCAGATGGAAAGTAGCGATATAAGCTAGAAATCAGACCATGCGTCCAAGGGAACACACGGACCTGACCTATTGTTCTC ACTTGCGGGCAGATGCCGATTATAATAAGTGCATGTTTCCTTGCTAGCCGTACGTCAGCATCTCAAATTTGATGATCGCACATAGTGTTCCCTTCACT TAAATGATGAATTGTAATGCTCGGGAACCACGTGTCCTACGAGCTAATAGATTAAGACTAACTTACCGCGCAAGAGATGTATTATCGTAGTCCTCCGT CTACACTCTTGAATTCCTCGCTGGTAAAGGCTACTGTGTAGCGACGTGTTCTTACGTACAGGTGTGCTCATAATATAAGAGCTGCTAGAAGAGAGTAT CAACAAAACGTCGATACGATGTTTGGATGTATATACGACGTTACGTACATGTGTAATGCTCGTCGCCTTAGTATAGCGCGGTAGTGAGGAGAGTCCAT AGAAGGAAGCCTGTTTTCGTCGGTCCCTACGCGTCGAAGGCGCTCTTTCGGATCTTGGCAAAAACAAGCCCCGGAAGACCCAGGAGCATTGTTTGTAA CGCGTGGAATTTTCGCGTTCGTCCCATGTTTGTTAAAAGGACTTGATTGTTAAACCTCGGACTAGCATTCCAGGTTCCGGTAGGGTACGCGCTATAGT ACTATGACGACGTTCAGGTCTTTCAAGAGTGTGAGGAAGAGAATACTGTTCAAGCTTATTTCGCCTAGCCCGGCCAGTCACAGATTGCACGGTTGAGA TCCGAGATCATTCTCGCATAGGCGGACTTATGCTTCACAAGATGCAGGAATCGGAGTCACTTACCTCATTTCTGGTTGAAGGGGAATTTTATATATTG AAATCTTTGGCCCGCTTACCGGTGGCTCAAAATGCCAGCGCGGGAATGGTGCGGATGGTCCTAGACGATTCTCGTCGCGAGTCACGTACATACTTTCT TACCGATATACCTAGGCCCGTATGCTATCGAGTGAGTTTGGCCTTTGTGAGCCTTCGTAGGAAGCGTCATGTCGCAATCACGGATTAATGGCCACAAT GATAGCTCGGAGAGACGTTTCGCTCTTACAAGATCCTTTCTAAAAGAGTTGTGATCTTGCAAAACCTAGTCAACAGGGCAATGGGCGATCCAGGGAGC CAAGCCGGATAACGTTAAACGCAGGGGTATCCCGACCTGGAGCGTAGACAGCGATGAAGCATAGGGATTTCGTGGACTTGCGTGTTTAACCGACCATC AATTCTAGATACACTCTCCCCAAGCTAGATTTACCGTGGAACGCAGCACACTGAATAGGAGACTTCCTCTTGTCCTACTACCCCACCGATTATCCATT TGATTATTGCTCGTGCTAGCCTGAAGAGTCGTCTAGGGACCATTGATCTGGACTATTCGAGCTCGGATCATGCTTTTCATATTCTCATTTCTGCTAAC CGTTTGATGCGTCTATCAACGTCGCCCGAAACTCTTTATCGTAGACATCTGGAGAGATAGTACGAAGTGTAGTCGACTTCGTGCCATGGTATAGAGGT CCGGTACAATGAAGGAACTCTTGTACAAATGAACAGCATGGCGGCGTTACATAGTTGACAGGACTGGATTGCCTTACCAGGGAGCGTCAGTATCGAGA TCCTCTTATGCCCTCCAGAGATGGAATAACGCTAAGGCTATTTTGACGAAACCCCTTAGCGCTGCTCCTCGCTAGCGGTGGCTTTTGGTATCTGGTTC ACATACTGAAATCTTCCAGAATCCTTCTGTGAAAGAAAAGACGAGAGGGGTGAAGCATATTAAGGGGGCCATCTGCCTACGTGAGGATCGCAGGGAGC CACGCGAGGACTCCGGCGAACATAATGCAACATCCTCGTCGGGGCTTACGTCAACGAGAGGTAAACGCTAGGAGAGATATTGATTCGACCGGTCCTCC CCGAATCCGTAAACGTCTATGCAAAAATCCGTCAAGTAGGACTCGAACCAGCGAGACCTAGTTTCTCCCTAGGTCGCCTACGCGCAGGCGGTCTCCAC GAGTGCCTGTATATCGGCCATCGTTATAGGTGATCTCAGTATAACTCGGACCTTGTTTCCATTTTACGTCTTCCTGTCAAAACCAGGCGCCTAAAATC'

print("Random Distance Result =", DistanceBetweenStrings(pattern, dnas))

Sample Distance Result = 5
Random Distance Result = 64


In [14]:
# Randomized Test 
k    = 6
genome ='AGCTATTAAATGGCGAAGCTATCTATTCTCTTATGGATATGT CAGTTATCCGCATTTCACTGGGAGGATCTTTTATGGTCTGGT GTTCATCAGGAGATCAGCCAAGTCCCGAGGTGATGGGTACAC CTTAGTCTATCAAACACGGTAGGGTAATGGTTGCATAACCTA GAAGAGCGGTGTGATCTATATGCACGGTAGTTTCGTTTATGG TGCCAACTGCGATAATGGCGCAGACGCTTCTTAACTCCCCGC GCCTAACCGGCACATGTATTATGGCGACAACGTGTGAGGGGA ACGGCACTCATGGGGCAACGGCGGGAGTCATCATGGTTAAAG AACACAGTTAAACGACCGCATGACATTTGATTATGGTCCACG GCGCGCGGAGCAAAGTAATTGGTGTTATGGCGAGATTGATGG'

print("Randomized Median String =", MedianString(genome, k))

Randomized Median String = TTATGG


In [15]:
# Stepik 1.5 - Compute Pr(text, profile)

# Input:  String text and matrix dictionary profile 
# Output: Float compute value

def ComputePr(text, profile):
    p = 1.0000
    for i in range(len(text)):
        p = p * profile[text[i]][i]
    return p

text = 'TCGTGGATTTCC'
profile = {
    'A': [0.2, 0.2, 0.0, 0.0, 0.0, 0.0, 0.9, 0.1, 0.1, 0.1, 0.3, 0.0],
    'C': [0.1, 0.6, 0.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.1, 0.2, 0.4, 0.6],
    'G': [0.0, 0.0, 1.0, 1.0, 0.9, 0.9, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0],
    'T': [0.7, 0.2, 0.0, 0.0, 0.1, 0.1, 0.0, 0.5, 0.8, 0.7, 0.3, 0.4]
}

print('Sample should be 0.0205753ish =', ComputePr('TCGGGGATTTCC', profile))
print('Exercise Compute Pr =',ComputePr(text, profile))


Sample should be 0.0205753ish = 0.020575296
Exercise Compute Pr = 0.0


In [17]:
# Stepik 1.5 Solve the Profile-most Probable kmer Problem

# Input:  String text, integer k and matrix dictionary profile
# Output: String most_probable_kmer

def ProfileMostProbableKmer(text, k, profile):
    max_prob = -1
    most_probable_kmer = ""

    if type(profile) == dict:
        for i in range(len(text) - k + 1):
            kmer = text[i:i+k]
            probability = 1
        
            for j in range(k):
                nucleotide = kmer[j]
                probability *= profile[nucleotide][j]
            
            if probability > max_prob:
                max_prob = probability
                most_probable_kmer = kmer
    
    if type(profile) == list:
        nucleotide_to_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

        for i in range(len(text) - k + 1):
            kmer = text[i:i+k]
            probability = 1

            for j in range(k):
                nucleotide = kmer[j]
                nucleotide_index = nucleotide_to_index[nucleotide]
                probability *= profile[nucleotide_index][j]

            if probability > max_prob:
                max_prob = probability
                most_probable_kmer = kmer

    return most_probable_kmer

text = 'ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT'
k = 5
profile = {
    'A': [0.2, 0.2, 0.3, 0.2, 0.3],
    'C': [0.4, 0.3, 0.1, 0.5, 0.1],
    'G': [0.3, 0.3, 0.5, 0.2, 0.4],
    'T': [0.1, 0.2, 0.1, 0.1, 0.2]
}

print('Sample w/ dict should be CCGAG =', ProfileMostProbableKmer(text, k, profile))

profile = [
    [0.2, 0.2, 0.3, 0.2, 0.3],
    [0.4, 0.3, 0.1, 0.5, 0.1],
    [0.3, 0.3, 0.5, 0.2, 0.4],
    [0.1, 0.2, 0.1, 0.1, 0.2]
]

print('Sample w/ list should be CCGAG =', ProfileMostProbableKmer(text, k, profile))

Sample w/ dict should be CCGAG = CCGAG
Sample w/ list should be CCGAG = CCGAG


In [18]:
# Stepik 1.5 Greedy Motif Search Algorithm

# Input:  String dnas, integer k (kmer length) and integer t (sequences)
# Output: matrix profile

from collections import Counter

def ProfileMatrix(motifs):
    n_seqs  = len(motifs)
    k       = len(motifs[0])
    
    profile_matrix = np.zeros((4, k), dtype='f')

    for i in range(k):
        base_i = []
        for string in motifs:
            if len(string) >= k:  # Check sequence length
                base_i.append(string[i])
        count_base    = Counter(base_i)
        profile_matrix[0, i] = count_base['A'] / n_seqs
        profile_matrix[1, i] = count_base['C'] / n_seqs
        profile_matrix[2, i] = count_base['G'] / n_seqs
        profile_matrix[3, i] = count_base['T'] / n_seqs

    return profile_matrix

def GreedyMotifSearch(sequences, k, t):
    best_motifs = [sequence[:k] for sequence in sequences]
    best_score = float('inf')

    for i in range(len(sequences[0]) - k + 1):
        motifs = [sequences[0][i:i + k]]
        score = 0.0
        profile = []

        for j in range(1, t):
            profile = ProfileMatrix(motifs)
            most_probable_kmer = ProfileMostProbableKmer(sequences[j], k, profile) 
            motifs.append(most_probable_kmer)

        best_score = DistanceBetweenStrings(best_motifs[0], best_motifs[1:])
        current_score = DistanceBetweenStrings(motifs[0], motifs[1:])

        # Can also use
        # best_score    = Score(best_motifs)
        # current_score = Score(motifs)

        if best_score > current_score:
            best_motifs = motifs
            
    return best_motifs

# Sample Test
k    = 3 
t    = 5
dnas = 'GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG'
sequences = dnas.split(" ")

print("Sample output (CAG CAG CAA CAA CAA)?", GreedyMotifSearch(sequences, k, t))

# TODO: Not returning expected results, Need to investigate online


Sample output (CAG CAG CAA CAA CAA)? ['GGC', 'AAG', 'CAA', 'CAC', 'CAA']


In [20]:
# From https://github.com/andmedina/greedyMotifSearch/blob/main/greedyMotif.py

# Input:  A list of kmers Dna, and integers k and t (where t is the number of kmers in Dna)
# Output: GreedyMotifSearch(Dna, k, t)

def Profile(Motifs):
    k = len(Motifs[0])
    nuc_freq = {symbol: [0]*k for symbol in "ACGT"}
    for row in Motifs:
        for index, char in enumerate(row):
            nuc_freq[char][index] += 1/k    
    return nuc_freq

def GreedyMotifSearch(Dna, k, t):
    if type(Dna) == str:
        Dna = Dna.split(" ")

    BestMotifs = []
    for i in range(t): #range over strings in Dna
        BestMotifs.append(Dna[i][0:k]) # sets best motifs to first 3-mer in each dna string
    n = len(Dna[0])
    for i in range(n-k+1): #range over length of a string in Dna (all of them are the same length)
        Motifs = [] #empty list of Motifs
        Motifs.append(Dna[0][i:i+k]) #get Motif from first string in Dna consecutive possible 3-mers
        for j in range(1,t): #range over from Dna[2nd string] until the end of Dna strings
            P = Profile(Motifs[0:j]) #Get profile of probabilities for Motif previously obtained against the strings below  
            Motifs.append(ProfileMostProbableKmer(Dna[j],k,P)) #append most probable k-mers found in Dna strings 2 to 5 to the list based on the motif obtained from Dna string 1
        if Score(Motifs) < Score(BestMotifs): # create consensus string, and then compute dissimilarity scores for current Motifs and previous BestMotifs, if the current is less (i.e. better) than the previous. 
            BestMotifs = Motifs #update the best as the current Motifs.
    return BestMotifs

# Sample Test
k    = 3 
t    = 5
dnas = 'GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG'

print('Sample output (CAG CAG CAA CAA CAA)?', ' '.join(str(e) for e in GreedyMotifSearch(dnas, k, t)))


Sample output (CAG CAG CAA CAA CAA)? CAG CAG CAA CAA CAA


In [None]:
# Random Test
k    = 12 
t    = 25
dnas = 'GGCCAGGTTGTCTTCAGATAAATGGAATTGCAAGTGGAGAGGCGGATGCCTGACCACAGGAACACGAGCACCGAACTCCGCCTCATACAGCTTCACACCACACGATATCCATAGGCCGCGTACGGACCGTGGCCTTGGCTAGTCCGCTGCGGGAAG AAAGGTTCCCTGATAACCGAATTACGGAAGGCATGTCTACAGTATAGTTTCAGATATAGTTATTCTAGAAAAAGATGTAGGCCCCTTCAGCGTATTACTCTACACAGTCGGAGTAAGAGTATATAGTTTGGACTATTAATTCAATGTCACGACAAT CTCTCATGTCAAGCTCCGTCGCCATCCTCTGGTTCGCGAATTGCATTGTTGCTACGTAGGCACAGGGCGACGGTCCAGCCGCGCGATCCACGATGCGAGGGGAAATTCCTCAGATAGATGGGCAGGTTATTGGTTCTAGTCCGTTGTGGGACGAGT TAACCTCATAAATTTATTACTCTAGTTGATTCACTCCTATTATTGCTTGTAAGAGCTATCGCTAGCGACGAAGTCGGCTGACGGCAACATGAACTCTGGAACGCTACATCTACCGCGACCATCAGATAGAAACATGCTCCACCAGCGACGCTAGCA AGAGACTCCGTCCAAACTCTGGACTGATGTCTCGTCCCACAGCACCCGGGAATGAAGACGTCTATACCCCTTAAGAACGATCCGCTCAGATACACCGGGTGAGTTATAGATTAGACAAGAATGTAGGCGGGCTCCATTAGCAGACAATGGGTGTTA AGCGACTTCAAGAAGAAGGCGACTTCAGCATGGGATGCGAATAAAGTAGGAGGTGCGGGTTTGGAATACAATGATCCCATAATCTTCAGATATAGGTAGGCGATATTGACAAGACAGATGTATAGTGGGCGCAGAAGCTGTGCATCTGAAAACTTA CCTACACTTGGCCAGCGATCCTCAATTATTGGTCGATTTAAGCCGTTAGTCAGATACAATGTGTATTCTGTCACGCGCGGTTAAAACGGCGGGTCACGCACATAAGGTCTCGAGGCCCTTAGATTGGCGGCTAAGCAAAGCCTAAAGTTCCGATCC TGTGTACGCAATCATATCGGACCTAATTGGATGTACATCAGATATATGGCGAGCTTGGGCCGTTCTAGCAGTGGCTTTCTAAGGCAGTCGTACGGAGAATATGTGGTCAAGTTTGTCGACTGAATCGTAGTACTTGGCACGCAGCTTAGCCACAGC TGACTCCGTTAGCAATTGCTTAGCCTCAGATAAACTTGGCGTGATATCGGCACGGAACGTGCCCTGAGTGCTGTTGGGGCAGTGGATGTGAGTGCACCCGTTTTATGTCGTGAAATGCCTGGAGTGATCGTCGAAACGACGTGCTAAGTTGGCCGA TAAATTTGTGCAATGACGGGTGGTACAAAACTATTATATTTTCGCGAGGATCCAATGCCCTTCAGATAGACAGTCTTCATGAAAGTCCTAATAGGTGGACCCTAGGGCTGTATCGGTCGGTATTTCCAAGCTCACGTCACGAGTAGATACACTCAC GTTGTCATGTCATGGGTCAACGTGAAAGACGAGTACCGTAGCACGGTATAGTCTATGGCCTATAGTTTGTCAGTCTTTGACCCAGCTACTGGACGGAATAGATTAACTTGTCCGAGGCGAATCAGATAAATTTCAAGTGACCATGATGATGCTTTC TCAAGTTCGTAGATCGAGGGTTGCGGATCTCAGCTCCGACTTACGTAGAGAATAGATCTCTTGCGTTCAATGGAGAAAGGGCTACCCATAAGCCATACTAATAACCCTATGGAGTGACGAGTCAGATACAGCTGTTAGCGGTACTGCGTAGACTCG TGCCCACCACATGTCGTGAAGCAGACTTGCATGCACGACATGAACTTAAACGCGGCATATCACAAGGTACTAGTGCCCTCTGCATTTATGTAGCGTAACCAATGGAAAATACGAATAGGCTGTGTACAGTTTTTCAGATACACAGTCAAGCTGAGG GTGACTTTACTGCACGTTGCTTCCAGGAACTGCCGACTTAATCGTGAGCGAAAACTAGTTCTCAGATATAACAAGTTCGAATAAGAATTTCACAGACCGGAATCGGAGGGATAGCCATAAATCTGGGTATGATGGATAGGTCCTGTCAATATTCGA TTTTGTAATGAGTTACGCTTTGCTTGGCTAGCCTTTGACCAAGTATATCCCATTTGCCTCCAACTCCGCATGACTCTGCCGGCGTCTACAGTATCAGTCGATAACAGGGGCCGCCCATCCAAACGTTCAACAGTCTTTTCTCAGCTCAGATACAGA GCGATGTGCGTTGATCGAAGGTAAAGTCCCGCGACCCACATTGCCGCTAATCTTCATTTGATCGTTGTAGAGACGTCCCGCGTAGAATCATTATCGATCAGATAGAACGCTTTAACGACCACCACTACTCGCTAGCCTTAACTTACGTTTGTCTGT GAAGTTTAATATCGAAACTTGGCTACATTGCCGTATAAGAGCTTACCGGCCCTACAAAGGATAGGAGTCAGCGCGGATCCAAATATCAGATAAAACCATCCACACTAGGATAGCAAGGCCATGCGCGGTGCCCTCAAAGAATAAGTGTGGAAGGGA CATTCCCTGACAATACGATGCACGGCTACAGTCTAGGCACTGGATACCTACGCGGCTGGCGTATCAGCAGCGTGGTTACATTAGACTTCCGGATGACGTCTTTAGGAAATCAGATAGATTATGGTTATCACTGGGCAACGCATCTCTATTGCTTCG ATGATTTTTCCGCAGTTTGCTTCTGGTTTACGTTTCCGCGGTTCTAGCTTCAGATAGAGTAAATTTCGGGGTCGGAGGTCCTCCACGATATTTGCGAACTCTTCGGATAGACATTTAGGCGCAAGCTCCGGCTCGCTATTGAAATCTTAGCGGGAG TTTTTGGATACGTGCCTAGCTTCCGCCGCAAACCATTCATCAAAAGCGTTCGAGGGAGGTTTGACTTCTGCTTTCAGATAAAGACGTCGCTATGCTCCCGAAATTAGCCTCGCGTATTTGCCCATATGAGCTAAACCGAACCTTGGAACTTTGTTT GGTAGGTAACCGAACATAGTAACATTACATGTCAGCTGGTTCCAAAGGGAGTCTGACTCCCCAGCGTTCTATGAAAGGGTACGCCTCAGATAAAGGGTCCGAGTTTTCACGGGTTGCGTACGGACATGGTCGGGGTATTAGGTGTACAATTACACC GCCAGCCTGAAACAGTTCTGGGGCCTCAGATACAATCCGCTTTCAGTGAAGCGCCCGGGTCGCTCTACTGTCAGACTCGAGTCTCACATATTTCCCCCAGAACGGCAGTCATGGGTCAGACATGGGAAGCAACAAGAGCTCTGTCACACTACGGCG AAGCCTCATTATCGGAATGCAGTATGGTACTCCCGCGTCAGATAAAGGTGCGGCCATCAATCTCTCTTGGTCTTAACACTCGAACTTAAATTACAAATCTATTATTTCGCTGTAGTAGCCCTTCGGCATACTCGATATCGCCAAATCTACGCTTCT GCCGAGCGCAACGTCTTTGCAGCGCATGTGTTAGTGACGTGACGTATGTTCAGATAGATTCGTTTACAACAGAGTTAATTGTCCAGGATAACACTCCATGAACCTGTACAATTTCGACTCTCACACGTCCGTGCCCAATGGAAAACCCTTACGGCG GGCGCTTCACTACAATTTAACTTAGACGAAAGCGCGTGCCACGGTACACTTACTGAGCCAACTGCGCGTGCCGAGGAGAGGAATACGTCTGCTTTCGTCAGATACACCGTAACCGTCTTCAGTCGATATCCGTCCAGTGTAGTCGATTGCTGCGCA'

print('Random = ', ' '.join(str(e) for e in GreedyMotifSearch(dnas, k, t)))

In [15]:
# Random Test
k    = 12
t    = 25
dnas = 'AACGGCGACACGGTCCGACTGCACAACCAACTATCGCTAATGTTCCTCGCTACTTTAAAGTAATTGGGTCGCCATTGAATACAGCTCATCTATGCGATCTACCCTCGGTGAGTGGTGCATAATACGTGAAGTATTAAATGTTCTGCTTTAGCAGCA TAGAGAGATACGACCTGCGGAGAGGCTCTGACATAATTCTCGAATCCCCATGTCCTACGTGTTCATGACTATCAGCGCGGCCCCCTCTACTCTGGGAAGCCCGTGTACTGGTAAAACACATCATCCCTATCCACGAGCTGATTAGAGGGCGGGCAA GGCACCCCAGAAGGATCTGAGACACAACTCTCATCTCTATCCATTTGAGGGGCGTAAATCGGCAAGTATACACACCCTCGGGTTGTACCCCAACTAACTCTCAGCATATTGCAATGCCTCATCTACTCTTTGCCCCTACCATTTATGGGACTGGAC GTTCACACACCCAGAGGCCAATTTATGAGATAAGGACACAGCAGATACGTGATTTCATTCCCCTGAGGGTCGCGCTACCAAGGCGCATTAGCCTCATAGAAGCGGAGGCTACCTGGGAACTAATTATGGGTAATCTACTCTTGGCCTAAAGCCATA CCGTGTGAGAATCGAGTAGATCGATAACAGGTAAACCCTGGCTGCTAGTTCTACGCTTCGCCTTTTCCAGCGACAGGGAGATGTTCACTTAATCTGGGCCTCACGAAAGGTAGAGCCCCGCACGGGACGCCTAACTATCTGAATGTATGAAAGTTG ATCAGAACCGGGCTCTACGCTGGGTATGACTTTTCAAGTCGATTCGGCGTTAAAAACGCGCGGGCCTCCTGCTTGTAATTTGGTTGTTGGTGTATCGGCCTCTAAATGTCAGTGCGTATAAGAGGCAACCATGGTCTAACGCTCTATACCCATGTC CGAGCCGGTGAGGGCTTGTCGCAGCAACTATTAGTGATTTAAGGTGCGCGACAGCGTTCTCGGCGTATTGTCGTCTCCCGGTACGTGGGCGGTCAACAATATATATGACTCTACTCTAGGAGATGTTGGCGGCGAAATTGATGGTGCTAGGCAAAG GAGTGTTCTGTCGAAGCAGGTTCATCCGCCATTGGGTTCTACGCTCTGTCTCATCACTGGGCTAGAGCTGTCTGATACTATTCGCCCTTGTTGAAGTCGGTGAATACCTGGAGTCGGGATTACCAACTCTGAACGCATATTATGACGTCGAAACCG TCCGTATCCCCCCAGGGCCCGCGATATTTAGCTTGTCCCCCCTTGCCCTTCACACGGAATGTCGAGGTTGTATATGATGAGCAGATCTACGCTTTGCGCATAGCCCGTGGTGTGGATTTTCCTTCTGCCTTTGGATCTGAAGCCGTCCGCGCTCCA ATAATGTCGGTTAGGCTACAGCTAGATTATTTGCCTTGTGTATACCCACGACACCATCGTCGATATCTTGCAACTGTGCTCATTCCAAACCCTTGTCAATTCCCGACACAGAAAAGGGCTTTCTACACTAAGTTAGAACTTTAGAAAGGTACGTAG GCATGGCATGCCCGTCGACCGGTGCTCTACACTTAGGATACTTAGTTCTGTGTAATGTCTATGCTTATCCCAGATTAAGCCCATCAGGGCACCGTAAGCGTAAGCCAGATGTTGAGCCTGGGCCATTCCGAGCTTAAGGTCATTGATTGCAATTAG GCCGGGGTAAACTGCGCGTGGTTGAGGCAGCCCATCACGCTTTATCGGCTGTTCGACCGTTAGTCATCGTTACTCTACCCTAAGCCGGAGAATGGGCAGACATGTCTACCCAAGGCGGCAAACGACGATGTGTACGTACGACCCGCAGAGGCCCGA ACTACGTTGAAAGTCTACACTCTGGCAGGTCATGATCTAAAGCGAGTCATTGTGCTCTTGCGTTGCAGGAGACCTCTATCAGTATGTCACATAACGTCCAATCATTACGTCGGCATCTTTTATTGCGCAGTTTTCAGGCCCCATGTAAGACTGTCG ACGGTGGTTGCACGGGTTTTCAGGCCATGCAAACACGCCGTCAACGTCCATGACGAGTTTATCTACTCTATGTCGGGCATCCTTACACTTATACAGATGTGCTCCTGACCCATTTACACAACTCCGTTCGCAGAATCCACACGAATGAATAAGCAT CCGCATCGCTGTATCCGATCGGAACTCTACTCTGGGTGGAGGGCTCTCTCCTTAACGAATGACATGCTTGATCATTAAAGTGCAATCGACGAGCGAATCTCGCACCCTTTATCCAACAATCGAAGCCTGTTACGGACTAGCGGGTCCGAATATGGA TGGAATCCCGCGTTCTACCCTGTGGTAAAAGGGCATGCATGGCTCCCGTAGGCTACTCGTGAAGGCTGACTAAGGAACGGCGGTATGCAGGCCAAACTGGCGGTCCATATCGTAGGGGGGCCTCGACCCGTAATATTACATGCCGCTATTCCGGCA ACCCTACTCGCTTCGAGAATACTCGTCTACCCTCAGGGTAGTTCTAGCCAGAACAATACAGGCTTCAGCAGAGGCTTTGCCGCGACTCTCTAGGGGTCATCACCGCTTGTGCCGGCTCCAATCTGATCTGAGGAGCCGAGGCGGAGGTCTTCGGTT TATTCTGGATAAATCTACTCTTTGTCTGCAGGAGGCATCAGCCTCCAAACAGCTATATTAGTCCAGTACTCTTGTCCTACTTAATCGACTGTGGCATAAGGTTTGTTCGCACATAAACCGGGAGTTCTGATCAACTGTGGGTTCTCCTGTCGATGG ATGTTTCAGCCGGATTTCTAGCCGTAACGCTAGTTTTCCGATGAGCAAAGTGTAACCCAAGATGTAGTTACCCTCCGATCAAGCCAATGAGCATTGATCTACCCTGCGGATACTCCGGGAACGTATTAGTAGCAGCGGCCTCTTTGACCATCTCCG ATTAAGATTCACGGTGGCAGCAAAACACCATAGCAGGATTCCATCTTATTGCTCGAAGGACTCTACCCTAGGATGCCCAATAGAACTACCAACTTTGGGGTATATGGATGGGTCCTAGCAACGCGCCCCACGTGGCCCTGGAATAACCGAATGTAG AAGATGCAACGGTCGACCAATTGTGTCTACTCTAAGAACTGTGAGGCCACTAGTTCATGAGCATTAATCGTCGCATATATTCGCGCCGTGATGGAATAAAGTCGTGGCATCCAGTAGAACACCGAAAGTTTTTTCTTTTAGACTCCTCGTCCGAGG ACTTCAGTTAGGCGGAGCCAAGGAACCACCTAGCACCATACACACCCCAGGTCCGTAGCGCAATGTATGAAAGCATAGTTAGGGGATTTAAACGCGGCTTAACTCCTAGTCTACCCTCGGGCATTACCCTTGCTTCAAGGGTCGACCCAGTGCCAG AACTGCTGGGGCATCTTGCTGTTTAGTCAGAGCGCCTTTCTATATATGCCGCGACAAATCCTACTCGTGCCCAACCGGAAACGTCCGCCGCACCTAACGCCGCGCTCTGGGCTACCACACAGATCTTCTTCCCGTATCGATGATCTCTACTCTTAG GGCTGATGTGCAATGAGGTATTGCCTCTGAGGTGCGATCTACGCTGAGATTCGAGTGACATGGCTAGCGCCCAGCGTTGCAGGCGATACGGACAGTCACCGCCGGTTCGGAGTATTGACGGGGGAGTGAGAATTGCGAGTATAACTCGACGGGAGA TTTGTAGCAGCGTTAACTATGGGCGCGAACCTTATCGTCTACCCTCGGGCAGTTCGGCGACCCATATCTCAACACCTTATTAATGTTAATCCTAGATTTGGGCATGACAATTGCCTCGTATTAGTGGCAGCGGCACCAATCCCACAAGCCGGTTCT'
sequences = dnas.split(" ")

print(' '.join(str(e) for e in GreedyMotifSearch(sequences, k, t)))

AACGGCGACACG TAGAGAGATACG GGCACCCCAGAA GTTCACACACCC CCGTGTGAGAAT ATCAGAACCGGG CGAGCCGGTGAG GAGTGTTCTGTC TCCGTATCCCCC ATAATGTCGGTT GCATGGCATGCC GCCGGGGTAAAC ACTACGTTGAAA ACGGTGGTTGCA CCGCATCGCTGT TGGAATCCCGCG ACCCTACTCGCT TATTCTGGATAA ATGTTTCAGCCG ATTAAGATTCAC AAGATGCAACGG ACTTCAGTTAGG AACTGCTGGGGC GGCTGATGTGCA TTTGTAGCAGCG


In [16]:
# Stepik 1.6 Greedy Motif Search Algorithm with Pseudocounts

# Input:  String dnas, integer k (kmer length) and integer t (sequences)
# Output: matrix profile

import numpy as np

def count_occurrences(motifs):
    counts = {'A': [], 'C': [], 'G': [], 'T': []}
    k = len(motifs[0])

    for i in range(k):
        column = [motif[i] for motif in motifs]
        counts['A'].append(column.count('A'))
        counts['C'].append(column.count('C'))
        counts['G'].append(column.count('G'))
        counts['T'].append(column.count('T'))

    return counts

def ProfileMatrix(motifs):
    counts = count_occurrences(motifs)
    k = len(motifs[0])
    profile = {'A': [], 'C': [], 'G': [], 'T': []}

    for nucleotide in ['A', 'C', 'G', 'T']:
        for i in range(k):
            profile[nucleotide].append((counts[nucleotide][i] + 1) / (len(motifs) + 4))  # Adding pseudocounts

    return profile

def GreedyMotifPseudocounts(dnas, k, t):
    if type(dnas) == str:
        dnas = dnas.split(" ")

    best_motifs = []
    best_score = float('inf')

    for string in dnas:
        best_motifs.append (string[0:k])
    base_string   = dnas[0]
    other_strings = dnas[1:]

    for i in range(len(dnas[0]) - k + 1):
        motifs = [dnas[0][i:i+k]]

        for j in range(1,t):
            profile = ProfileMatrix(motifs)
            most_probable_kmer = ProfileMostProbableKmer(dnas[j], k, profile)
            motifs.append(most_probable_kmer)
        
        # best_score    = DistanceBetweenStrings(best_motifs[0], best_motifs[1:])
        # current_score = DistanceBetweenStrings(motifs[0], motifs[1:])

        best_score    = Score(best_motifs)
        current_score = Score(motifs)

        #print(motifs, current_score)
        
        if best_score > current_score:
            best_motifs = motifs

    return best_motifs

# Sample Test
k    = 3 
t    = 5
dnas = 'GGCGTTCAGGCA AAGAATCAGTCA CAAGGAGTTCGC CACGTCAATCAC CAATAATATTCG'

print('Sample should be (TTC ATC TTC ATC TTC)?', ' '.join(str(e) for e in GreedyMotifPseudocounts(dnas, k, t)))

Sample should be (TTC ATC TTC ATC TTC)? TTC ATC TTC ATC TTC


In [17]:
# Random Test
k    = 12 
t    = 25
dnas = 'AGATAACCGCCCAATTTGGCTTAACAGAGCTGAATCATACTAATACAAGTGTATAGCATGACTTCTCGCTATCACCGTTAGGTATAGGTCCCCCTGGCCGCGAGACACCTAGGCAGGTAGGCTAATTTATAACGACCAAGCCTTAACTGCGGTCAG ACGGAGTGCGTGTCTCCCTGCTCGGACAGTGACTACAGGGAGGAGGCTCTATCCATGGGTGCCAATTCATCACAAATGTCTATAAATATCGTGAGGTAATGGGATCGAATTGCCGAAAGCTCCATACATAAGCAGTTACCCCTTCGGCCGGGTCGG GGCCAGTCTGGTACACGTGCTCCGAAGAGCTCTGATCGATCAGCAATCCATACGGTCAACCCAGCTTATACTCGCCATCATGTGTCCAATTGATCACTGAGCAGTTGCGCTGTTTATAAATCCGGCCATTCGGAAATAGCGAGTCAATAATCTACT GACCCAACTAATGGTAGGCCGATTAAGTGATGGTTAGAGAGTATATCGCGTGATCTACTGCGTCAATTGTAGCTTTTACTTTCGTCCAATTCATGAGTTGGTCATGCGCTGCGCGTGAGCTAGTGATCGGGGGAGGGTCTAGGACGATCAAGTCAT GCGGATCAAATTAGCTCATTTGCGTCGTCCTCTTGTCGGGTTCCCCCCCGGAGATGAGACAGATAGCTAGCCGCCAATTTATTAGCTTTCACTTGTCACTCGGAAGGGGGTGGCTCGAACAAAGGACTGACTGCCCCTGGGAATTAACCGTTTATT GCGACGCAATGAGAAACGGATGGCTAGAGAGGAGCTAAATTGTGCGTCACGCGACTAGTCGCAAATTGATCAAGAAAGATTCCTACTTATAGGGAGTAAGAGTTAACGAGTAGCAGGGCTAACCTGCTATCTTACTACTTCACGGGCACTCGTCTA GAGTGCGAAGTTAAGGCGTAATGCAACTCTTTAACTTTGAGCGCTGCGACTGGCCCTTAATCGACACATGGTCAGAACTGCCACACGGTCCTCGCCGCTGCGAACAATCAGTCCAAGTCACACCACAAGACTTCCACGCCGCGGCCGAATTCATCA ATTTAGGCCAAACCCTGGCATATACGGGTGTGCTTAAAAGTCAAGGGCACAAATTAATTAGGCATTGAGCAGCAAGCCGATACCAAAGTTCGCTCACGCACGGCAACTATCCCTTCCTAAGTACGACCACGCCCCATGTATAAGTCCCCCGATCGT TATTTCCAGTCATAGTACGAACCGCGCACTAGCTAGTGGTCCTTAGCAGTGTCGATCTTATCGTATGAGAAGTATGACCGTACGTAATAGCGGATGGTCGGCTGATAATTCTGCTAGACGTCAAATTCATTATTCTGGGGCGCCGGCATTATAACA TCGGCGACAATACTAGGGTTTGTAATGCTGACCAACTTTGTCTCCTTGGGGTCGCGGTTAACGCGCTGTATTCTATAAAAACTTTGACCTGGGACCTCTTTCCCTGGACCCAATTTATTAAGTCGGAGAACCACATGAAGAACAACAGAAATACTC GGTACCTCAGACGTAGAGATGAAGCAAATCAAGAGTTTTCTGACGGGACGACGTAGGTAAGAAACCGTCCAGCTAAAGATAAGGTCTTGGTCAAGAGCGCTGGAAACATGCCACCGAGGTTATTCTTAACCATCAAATTGATAATGCTACGTCTTC TCCTTAATATTCGGCTTTTCCCTTATTCTTGAGGTCTTAAAGGAATTCGCTAATGGCGACTCGTATGCGACGTAATAAATGCTTACGTGCCTCGATTCAAATTGATGAAAGTCCGGTGTCCAGTCAAATCCAGATATGGGAATGGAGCCAGCATGG GCCAATTGATAAGTAAAGATGCTGTACCACGTCCTCTAACCTCCAAGAGAATCGCCACTCTTGGAGGCACTTCATGTATATCACGGCTATTGGACTGATTTTTAGTAGGATGTTTCCGTTCGTCTTTGATTCAGGACCCTTTTTAGTCGCCTATGG ACATCTATCGTTACTGGACTAGGGAGGAAACATCAGTTTGGAGATACCTCTCTAAACGCGTACACACTAGCGTAGGTTTTTCAAGATACTCCAGTTTGCAGGGCTCTGGGGCCTGGTCACACTAATTCATTATGCGTGCATATAAGTGCGACAAAG TTCGATTGAATATCGAATTGATTAGTCCCCTAATCGAGTACCGGGTATACGATTAGACTGGTGTCGCTACTGTTACTAGCAGATCACATTTTGAGCCGCATGAGCATGAAATTCTTCCGAGCCCGCGACGGATGCAGCTCAAGCATGCAGGTAGGG GCGAACCCGATCAATTACTTCAGAAAGGTCGTTGGGGCGATCCTGCGACCAGTAGTAAGAGACTCTCGACTCTGCATAATCTCGCAGCTCTGGGTGACGTGACGATTCCGTATCGAAAATACGAATTTATCATGCCGTAGGCGAATCATGTTGCAT ATTTAGGAAACTTCCGTAGCGATCCCCTAATTGCTGCCTCATCTGAACGCTTCACGGACAATTGAGGAGAGGTGCACTTTGTCCGTCGGATTAGTCATGCTGTGGTCCCAACGAACGCCAAGAGCTCGGACTCAGTGGGAGGGTACAAATTGATGA GTCAAGAGCGTATCTTTACTATATACGCCTCATAGGCAAGCTATATTGGCGAATTCATAACTTCTAAGTCCGTAAGTAAACATCACGGGTTCGAGAGAAAAGGGAAGGTCGAATGGGGCAGGCTTGCGCCTCGATGTGTACCATTCCTATTTCATA TGTTTCAGAGGATCGAGTTAGCCCCCGAGTTTACATTGGGCGACAAGGATCAGCTTGTATACAAAGTGGATGTCGTCCGGAGTAAACAGGCTACGCCCCTGTGATCGCACCAATTAATTATAGAGGATCGTGTGAGCTTCACTCATGGTCCAAGCC AAACTCTTCTAAGTTATATTCCACGGGGATGGGTGATCTAATTTATAATAAATAGCAGGTGAGCATCCGCACCCTGGTACTTTATGCCTGGTTCGAACCCGCTGGCGTTACGTCTTCCCATAGTGCCTCTTTCGGCGCGCTCGCCCTGACTCTCGT TTATGTACCCTGATACTTCGACCGCTCCCCCCTGTAATTTTCACTTTGTGCGTACACTCAATAACATGTGTGAATTTACGTGTAGTGAGAGCATATCTGCGTACTGCCGACGGTCTTAGCTCCGTAGATCCGACTAATTTATGATCGTACTGAGTC GCTAATTTATAACATTAACAAGCGTAAATCAAATGTGAGCACGTTCTTTAGTTAGCAATGCCTTATTTTGTCCTAAGAACAGTAGGCTACAGTACCAGGGGGTATCCATACTGCAGCCATATGGTCAACATATAAGTGATATCATGATTTCTCAGC TTTATAGCCTATGCCGAAGTGTTGCTTTAGATCCATAGAACCTCTTCGCCTCTCAGGTTTACGAATTCATGAGTAAGTAGGCTTATCTCAGAGGTAGCGACCATGGTAACCCCCAAACGTTTTCTCCCAGCTTGTCGGGGAGTACCCCCCCCGGTT CGATGAGTACGCCCTCTTACATGAGATCGTGGAAATTTATTATGTGAAGCTGAACCCCTAAAGATCCCAAGGTCCAATTTATTACCTTACGGCAACTGCTCATTCTAAGGCTTGATCGGCCGCCGGACCCCTGTGTCTATGTGTGAATAAAGTTAA CTTCCCGCAAATACGTCGCTGTCCCACTCATACTTGGCGGCTCCGCGGACCAATTAATTATCGAGCCACCGTAGGCTCTTCCATTCAGGTAGCGCCACCGAGCTTGCTCGCTTTCCCACGACGCTTATGCGCTATGTACTTCGCAGTTGACAACAC'

print("Random =", ' '.join(str(e) for e in GreedyMotifPseudocounts(dnas, k, t)))

Random = GCTAATTTATAA GCCAATTCATCA TCCAATTGATCA TCCAATTCATGA GCCAATTTATTA GCAAATTGATCA CCGAATTCATCA ACAAATTAATTA TCAAATTCATTA CCCAATTTATTA TCAAATTGATAA TCAAATTGATGA GCCAATTGATAA ACTAATTCATTA TCGAATTGATTA ACGAATTTATCA ACAAATTGATGA GCGAATTCATAA ACCAATTAATTA TCTAATTTATAA ACTAATTTATGA GCTAATTTATAA ACGAATTCATGA TCCAATTTATTA ACCAATTAATTA


In [18]:
# Week 3 Quiz - Question 3

# Order the following probability distributions from lowest to highest entropy:
# A: (0.5, 0, 0, 0.5)
# B: (0.25, 0.25, 0.25, 0.25)
# C: (0, 0, 0, 1)
# D: (0.25, 0, 0.5, 0.25)

import math

def calculate_entropy(probabilities):
    entropy = 0
    for p in probabilities:
        if p != 0:
            entropy -= p * math.log2(p)
    return entropy

# Example usage:
A = [0.5, 0, 0, 0.5]
B = [0.25, 0.25, 0.25, 0.25]
C = [0, 0, 0, 1]
D = [0.25, 0, 0.5, 0.25]

entropy_A = calculate_entropy(A)
entropy_B = calculate_entropy(B)
entropy_C = calculate_entropy(C)
entropy_D = calculate_entropy(D)

print("Entropy A:", entropy_A)
print("Entropy B:", entropy_B)
print("Entropy C:", entropy_C)
print("Entropy D:", entropy_D)

# Or define the probability as distributions
distributions = {
    'A': [0.5, 0, 0, 0.5],
    'B': [0.25, 0.25, 0.25, 0.25],
    'C': [0, 0, 0, 1],
    'D': [0.25, 0, 0.5, 0.25]
}

# Calculate the entropy for each distribution
entropy_values = {}
for distribution, probabilities in distributions.items():
    entropy = calculate_entropy(probabilities)
    entropy_values[distribution] = entropy

# Sort the distributions based on entropy values
sorted_distributions = sorted(entropy_values.items(), key=lambda x: x[1])

# Display the distributions in order from lowest to highest entropy
print("Entropy distribution from lowest to highest")
for distribution, entropy in sorted_distributions:
    print(f"Distribution {distribution}: Entropy = {entropy}")

Entropy A: 1.0
Entropy B: 2.0
Entropy C: 0.0
Entropy D: 1.5
Entropy distribution from lowest to highest
Distribution C: Entropy = 0.0
Distribution A: Entropy = 1.0
Distribution D: Entropy = 1.5
Distribution B: Entropy = 2.0


In [19]:
# Week 3 Quiz - Question 4

# Consider the following profile matrix:

'''A:  0.4  0.3  0.0  0.1  0.0  0.9
C:  0.2  0.3  0.0  0.4  0.0  0.1
G:  0.1  0.3  1.0  0.1  0.5  0.0
T:  0.3  0.1  0.0  0.4  0.5  0.0'''

# Which of the following strings is a consensus string for this profile matrix?  
# (Select all that apply.)

def Consensus(profile):
    k = len(profile[0])
    consensus_list = []
    for j in range(k):
        m = 0
        frequentSymbols = []
        for i, symbol in enumerate("ACGT"):
            if profile[i][j] > m:
                m = profile[i][j]
                frequentSymbols = [symbol]
            elif profile[i][j] == m:
                frequentSymbols.append(symbol)
        consensus_list.append(''.join(frequentSymbols))
    return consensus_list


profile = [
    [0.4, 0.3, 0.0, 0.1, 0.0, 0.9],
    [0.2, 0.3, 0.0, 0.4, 0.0, 0.1],
    [0.1, 0.3, 1.0, 0.1, 0.5, 0.0],
    [0.3, 0.1, 0.0, 0.4, 0.5, 0.0]
]

print('Consensus String Options by Position =', Consensus(profile))

# Returns AAGCGA, which is not an option for quiz.
# I believe AAGTGA, ACGCGA and ACGTTA are valid consensus strings

'''options = [
    "AAGTGA",
    "ACGCGA",
    "ACGTTT",
    "ATGCTA",
    "ACGTTA",
    "AGGTCA"
]'''

options = [
    "TCGCGA",
    "AAGCCA",
    "AGGCTA",
    "ACGTTA",
    "AGGTCA",
    "ACGCGA"
]

matching_options = []
consensus_list = Consensus(profile)

for option in options:
    is_matching = True
    for consensus_chars, option_char in zip(consensus_list, option):
        if option_char not in consensus_chars:
            is_matching = False
            break
    if is_matching:
        matching_options.append(option)

print('Matching options:', matching_options)

Consensus String Options by Position = ['A', 'ACG', 'G', 'CT', 'GT', 'A']
Matching options: ['AGGCTA', 'ACGTTA', 'ACGCGA']


In [20]:
# Week 3 Quiz - Question 5

# Consider the following motif matrix:

# CTCGATGAGTAGGAAAGTAGTTTCACTGGGCGAACCACCCCGGCGCTAATCCTAGTGCCC 
# GCAATCCTACCCGAGGCCACATATCAGTAGGAACTAGAACCACCACGGGTGGCTAGTTTC 
# GGTGTTGAACCACGGGGTTAGTTTCATCTATTGTAGGAATCGGCTTCAAATCCTACACAG

# Which of the following 7-mers is a median string for this motif matrix? (Select all that apply.)

def MedianString(genome, k):
    distance = float('inf')
    median = []

    # Iterate through all possible k-mers
    for i in range(len(genome[0]) - k + 1):
        pattern = genome[0][i:i+k]
        cur_distance = 0

        # Calculate the total Hamming distance for the current pattern
        for j in range(len(genome)):
            seq = genome[j]
            min_distance = float('inf')

            # Calculate the minimum Hamming distance to the current sequence
            for l in range(len(seq) - k + 1):
                kmer = seq[l:l+k]
                hamming_dist = sum(base1 != base2 for base1, base2 in zip(pattern, kmer))
                min_distance = min(min_distance, hamming_dist)

            cur_distance += min_distance

        if cur_distance < distance:
            distance = cur_distance
            median = [pattern]
        elif cur_distance == distance:
            median.append(pattern)

    return median


k    = 7
# genome = 'CTCGATGAGTAGGAAAGTAGTTTCACTGGGCGAACCACCCCGGCGCTAATCCTAGTGCCC GCAATCCTACCCGAGGCCACATATCAGTAGGAACTAGAACCACCACGGGTGGCTAGTTTC GGTGTTGAACCACGGGGTTAGTTTCATCTATTGTAGGAATCGGCTTCAAATCCTACACAG'
genome = [
    'CTCGATGAGTAGGAAAGTAGTTTCACTGGGCGAACCACCCCGGCGCTAATCCTAGTGCCC', 
    'GCAATCCTACCCGAGGCCACATATCAGTAGGAACTAGAACCACCACGGGTGGCTAGTTTC', 
    'GGTGTTGAACCACGGGGTTAGTTTCATCTATTGTAGGAATCGGCTTCAAATCCTACACAG'
]

print("Question 5 Median String =", MedianString(genome, k))

Question 5 Median String = ['GTAGGAA', 'TAGTTTC', 'GAACCAC', 'AATCCTA']


In [21]:
# Week 3 Quiz - Question 6

# Consider the following profile matrix Profile:

# A:  0.4  0.3  0.0  0.1  0.0  0.9
# C:  0.2  0.3  0.0  0.4  0.0  0.1
# G:  0.1  0.3  1.0  0.1  0.5  0.0
# T:  0.3  0.1  0.0  0.4  0.5  0.0

# Compute Pr(AAGTTC|Profile). (Express your answer as a decimal and do not round your answer.)

profile = {
    'A': [0.4, 0.3, 0.0, 0.1, 0.0, 0.9],
    'C': [0.2, 0.3, 0.0, 0.4, 0.0, 0.1],
    'G': [0.1, 0.3, 1.0, 0.1, 0.5, 0.0],
    'T': [0.3, 0.1, 0.0, 0.4, 0.5, 0.0]
}

print('Compute Pr =', ComputePr('AAGTTC', profile))

print('Compute Pr =', ComputePr('GAGCTA', profile))

Compute Pr = 0.0024000000000000002
Compute Pr = 0.0054
