In [25]:
from collections import Counter

import numpy as np

from utilities import generate_kmers, NUCLEOTIDES

In [2]:
# Lookup table: nucleotide symbol -> its position in NUCLEOTIDES.
# The functions below use this position as the row index of a profile matrix.
nucl2ind = dict((nucleotide, index) for index, nucleotide in enumerate(NUCLEOTIDES))

In [3]:
def profile_most_probable_kmer(dna_string, k, profile):
    """Return the candidate k-mer of ``dna_string`` that is most probable under ``profile``.

    Candidates are whatever ``generate_kmers(dna_string, k)`` yields
    (imported from ``utilities``; presumably every length-k substring in
    left-to-right order -- confirm against that module).

    Parameters
    ----------
    dna_string : str
        Sequence to scan.
    k : int
        Length of the k-mer.
    profile : numpy.ndarray
        Indexed as ``profile[nucl2ind[nucleotide], position]``.

    Returns
    -------
    str
        The first candidate with the strictly highest probability. When no
        candidate has a probability above 0, ``dna_string[:k]`` is returned.
    """
    best_kmer = dna_string[:k]
    best_prob = 0
    for candidate in generate_kmers(dna_string, k):
        # Probability of the candidate = product of the per-position entries.
        factors = [profile[nucl2ind[base], pos] for pos, base in enumerate(candidate)]
        candidate_prob = np.prod(factors)
        # Strict comparison: on ties the earliest candidate is kept.
        if candidate_prob > best_prob:
            best_prob, best_kmer = candidate_prob, candidate
    return best_kmer

In [4]:
# Sample input for profile_most_probable_kmer.
dna_string = "ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT"
k = 5
# Profile matrix: one row per nucleotide (row index comes from nucl2ind),
# one column per position of the k-mer.
profile = np.array([
    [0.2, 0.2, 0.3, 0.2, 0.3],
    [0.4, 0.3, 0.1, 0.5, 0.1],
    [0.3, 0.3, 0.5, 0.2, 0.4],
    [0.1, 0.2, 0.1, 0.1, 0.2]
])

In [5]:
# Regression check on the sample input: the expected most probable 5-mer is 'CCGAG'.
assert profile_most_probable_kmer(dna_string, k, profile) == 'CCGAG'

In [6]:
# Load the dataset: line 1 is the DNA string, line 2 is k, and every
# remaining line is one whitespace-separated row of the profile matrix.
with open("./datasets/dataset_159_3.txt") as fin:
    profile_rows = []
    for line_no, line in enumerate(fin):
        if line_no == 0:
            dna_string = line.strip()
        elif line_no == 1:
            k = int(line)
        else:
            # split() with no argument tolerates repeated or trailing
            # whitespace; a blank line yields no fields and is skipped
            # instead of crashing on float('').
            fields = line.split()
            if fields:
                profile_rows.append([float(field) for field in fields])
# Convert once, after reading, so `profile` is only ever an ndarray.
profile = np.array(profile_rows)

In [7]:
# Most probable k-mer for the current dna_string, k and profile
# (the values read from dataset_159_3.txt in the previous cell).
profile_most_probable_kmer(dna_string, k, profile)

'CACAATACCGTACA'

In [16]:
def init_motif_matrix(dna_strings, k, t):
    """Return the length-``k`` prefix of each of the first ``t`` strings.

    Parameters
    ----------
    dna_strings : sequence of str
        Input sequences; must contain at least ``t`` entries.
    k : int
        Prefix length taken from every string.
    t : int
        Number of strings to use.

    Returns
    -------
    list of str
        ``t`` prefixes, in the order of ``dna_strings``.
    """
    return [dna_strings[row][:k] for row in range(t)]


def create_profile(motifs):
    """Build the position frequency profile of a list of equal-length motifs.

    Parameters
    ----------
    motifs : sequence of str
        Non-empty collection of motifs; the width is taken from ``motifs[0]``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(4, len(motifs[0]))``. Entry ``[nucl2ind[n], j]`` is
        the fraction of motifs that have nucleotide ``n`` at position ``j``.
    """
    motif_len = len(motifs[0])
    # Every occurrence contributes the same share of its column.
    weight = 1/len(motifs)
    freqs = np.zeros((4, motif_len))
    for col in range(motif_len):
        for motif in motifs:
            freqs[nucl2ind[motif[col]], col] += weight
    return freqs

def create_laplace_profile(motifs):
    """Build a position frequency profile with Laplace (add-one) pseudocounts.

    Every nucleotide count in every column is incremented by one before
    normalising, so no entry of the result is zero and each column sums to 1:

        profile[n, j] = (count of n at position j + 1) / (len(motifs) + 4)

    Parameters
    ----------
    motifs : sequence of str
        Non-empty collection of motifs; the width is taken from ``motifs[0]``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(4, len(motifs[0]))``, rows indexed by ``nucl2ind``.
    """
    n_symbols = 4  # number of profile rows, same shape as create_profile
    motif_len = len(motifs[0])
    # Start every cell at 1: the pseudocount is one extra *occurrence*.
    # It must be added to raw counts, not to already-normalised frequencies,
    # otherwise it outweighs the observed data by a factor of len(motifs).
    counts = np.ones((n_symbols, motif_len))
    for col in range(motif_len):
        for motif in motifs:
            counts[nucl2ind[motif[col]], col] += 1
    # Each column now holds len(motifs) observations plus n_symbols pseudocounts.
    return counts / (len(motifs) + n_symbols)

def score_motifs(motifs):
    """Return the total number of non-consensus characters across all columns.

    For each column (width taken from ``motifs[0]``) the most frequent
    character is found; every other character in that column counts as one
    mismatch. The result is the sum of mismatches over all columns.

    Parameters
    ----------
    motifs : sequence of str
        Non-empty collection of motifs.

    Returns
    -------
    int
        The mismatch count; lower means a more conserved motif set.
    """
    width = len(motifs[0])
    mismatches = 0
    for col in range(width):
        column_counts = Counter(motif[col] for motif in motifs)
        # Everything that is not the most frequent character is a mismatch.
        mismatches += len(motifs) - max(column_counts.values())
    return mismatches

In [9]:
# Quick check of create_profile on three 2-mers.
create_profile([
    "AA",
    "AC",
    "TG"
])

array([[0.66666667, 0.33333333],
       [0.        , 0.33333333],
       [0.        , 0.33333333],
       [0.33333333, 0.        ]])

In [10]:
# Quick check of score_motifs on the same three 2-mers.
score_motifs([
    "AA",
    "AC",
    "TG"
])

3

In [11]:
def greedy_motif_search(dna_strings, k, profile_function):
    """Greedily pick one k-mer per DNA string so that the motif score is low.

    The search starts from the length-k prefixes of all strings as the best
    known motifs. Then, for every k-mer of the first string, it builds a
    candidate motif set: for each following string it derives a profile from
    the motifs chosen so far and takes that string's most probable k-mer under
    the profile. A candidate replaces the best known motifs only when its
    score is strictly lower.

    Parameters
    ----------
    dna_strings : sequence of str
        Non-empty collection of DNA strings.
    k : int
        Motif length.
    profile_function : callable
        Maps a list of motifs to a profile matrix accepted by
        ``profile_most_probable_kmer`` (e.g. ``create_profile`` or
        ``create_laplace_profile``).

    Returns
    -------
    list of str
        One k-mer per input string.
    """
    t = len(dna_strings)
    best_motifs = init_motif_matrix(dna_strings, k, t)
    # Score the initial guess itself so that best_score always describes
    # best_motifs; an unrelated upper bound would let a worse or equal
    # candidate displace the initial motifs.
    best_score = score_motifs(best_motifs)
    for seed_kmer in generate_kmers(dna_strings[0], k):
        motifs = [seed_kmer]
        for i in range(1, t):
            # The profile is built from all motifs chosen so far (strings 0..i-1).
            profile = profile_function(motifs)
            motifs.append(profile_most_probable_kmer(dna_strings[i], k, profile))
        score = score_motifs(motifs)
        if score < best_score:
            best_score = score
            best_motifs = motifs
    return best_motifs

In [12]:
# Sample input for greedy_motif_search: five DNA strings, motif length 3.
k=3
dna_strings = [
    "GGCGTTCAGGCA",
    "AAGAATCAGTCA",
    "CAAGGAGTTCGC",
    "CACGTCAATCAC",
    "CAATAATATTCG"
]

In [13]:
# Regression check: greedy search with the plain frequency profile on the sample input.
assert greedy_motif_search(dna_strings, k, create_profile) == ['CAG', 'CAG', 'CAA', 'CAA', 'CAA']

['CAG', 'CAG', 'CAA', 'CAA', 'CAA']

In [14]:
# Load the dataset: the first line holds "k t", every following line is one DNA string.
with open("./datasets/dataset_159_5.txt") as fin:
    dna_strings = []
    for line_no, line in enumerate(fin):
        if line_no == 0:
            # split() with no argument tolerates repeated or trailing
            # whitespace; both header values are parsed as integers.
            k, t = (int(field) for field in line.split())
        elif line.strip():
            # Blank lines (e.g. a trailing newline) are ignored so that no
            # empty DNA string ends up in the input.
            dna_strings.append(line.strip())
   

In [15]:
# Run the greedy search with the plain frequency profile on the loaded
# dataset and print the resulting motifs, one per line.
for motif in greedy_motif_search(dna_strings, k, create_profile):
    print(motif)

TCAAATCGGGTA
CAGAAAAGTGGA
AAGCGGCTGCAC
AACGGACTTCCT
TCCCAGCGGGCA
ATGGAGAAAAAC
TCCGATCGAGGT
TCCCAACGGGAC
TCGGGGCTGCCT
TTCCGAAATGTC
TGTTCGCTCCCA
TGGCAAATGCCA
ACGCAACTAGGT
TCCGAACGTGTC
TCGAAACGTGAA
ACTGAAAAGCTA
AATGAGCTGGAC
ACCCAAAGTGGC
TGGCAGAATGGT
TCGAAGCGTGCA
TGCAATCTGGCT
TCGCAACGCGGA
TCGGATCGGGGC
TCCGAACGAGTT
TCTAAACGGGGT


In [21]:
# Sample input for greedy_motif_search with the pseudocount profile: five DNA strings, motif length 3.
k=3
dna_strings = [
    "GGCGTTCAGGCA",
    "AAGAATCAGTCA",
    "CAAGGAGTTCGC",
    "CACGTCAATCAC",
    "CAATAATATTCG"
]

In [22]:
# Regression check: greedy search with the pseudocount profile on the sample input.
assert greedy_motif_search(dna_strings, k, create_laplace_profile) == ['TTC', 'ATC', 'TTC', 'ATC', 'TTC']

In [23]:
# Load the dataset: the first line holds "k t", every following line is one DNA string.
with open("./datasets/dataset_160_9.txt") as fin:
    dna_strings = []
    for line_no, line in enumerate(fin):
        if line_no == 0:
            # split() with no argument tolerates repeated or trailing
            # whitespace; both header values are parsed as integers.
            k, t = (int(field) for field in line.split())
        elif line.strip():
            # Blank lines (e.g. a trailing newline) are ignored so that no
            # empty DNA string ends up in the input.
            dna_strings.append(line.strip())

In [24]:
# Run the greedy search with the pseudocount profile on the loaded dataset
# and print the resulting motifs, one per line.
for motif in greedy_motif_search(dna_strings, k, create_laplace_profile):
    print(motif)

CGTATATACAAT
TGTATAAACAAA
AGTATACGCAAC
TGTATACCCAAC
TGTATACCCAAT
TGTATAGGCAAG
AGTATATTCAAA
AGTATAAGCAAT
CGTATATCCAAC
GGTATAGACAAA
AGTATACCCAAC
TGTATATGCAAT
AGTATACTCAAT
TGTATAAACAAC
GGTATACGCAAG
TGTATACGCAAG
TGTATATCCAAT
GGTATACCCAAA
AGTATAGCCAAC
AGTATATACAAC
GGTATATGCAAA
TGTATATGCAAG
GGTATAACCAAA
AGTATATACAAG
TGTATAAGCAAC


In [30]:
# -sum(p * ln(p)) for two outcomes with p = 0.5 each (natural logarithm).
-2*0.5*np.log(0.5)

0.6931471805599453

In [31]:
# -sum(p * ln(p)) for four outcomes with p = 0.25 each (natural logarithm).
-4*0.25*np.log(0.25)

1.3862943611198906

In [32]:
# -sum(p * ln(p)) for three outcomes with p = 0.25, 0.25 and 0.5 (natural logarithm).
-2*0.25*np.log(0.25)-0.5*np.log(0.5)

1.0397207708399179

In [35]:
# Scratch arithmetic: product of the factors divided by one million.
# NOTE(review): the meaning of the individual factors is not stated anywhere
# in this notebook -- TODO document them or remove the cell.
4*3*10*4*5*1/1000000

0.0024