In [5]:
from itertools import product
from collections import Counter

from utilities import hamming_distance, generate_kmers, generate_all_kmers, NUCLEOTIDES

In [6]:
def find_motif(dna_string, pattern):
    """Return the k-mer of ``dna_string`` closest to ``pattern``, with its distance.

    Each k-mer produced by ``generate_kmers(dna_string, len(pattern))`` is
    scored against ``pattern`` with ``hamming_distance``. The first k-mer that
    reaches the smallest score wins (later ties do not replace it).

    Args:
        dna_string: Text to scan for k-mers.
        pattern: Pattern to compare against; its length is used as k.

    Returns:
        A ``(motif, distance)`` tuple. If ``generate_kmers`` produces no
        k-mers at all, ``(None, len(dna_string))`` is returned.
    """
    k = len(pattern)
    motif = None
    minimum_distance = None
    for kmer in generate_kmers(dna_string, k):
        distance = hamming_distance(kmer, pattern)
        # The first candidate is always accepted. Seeding the minimum with
        # len(dna_string) and comparing with "<" used to reject a k-mer whose
        # distance equals that bound, leaving the motif as None.
        if minimum_distance is None or distance < minimum_distance:
            minimum_distance = distance
            motif = kmer
    if minimum_distance is None:
        # No k-mer was produced: keep the historical fallback value.
        return None, len(dna_string)
    return motif, minimum_distance

def find_motifs(dna_strings, pattern):
    """Run ``find_motif`` on every string and collect the results.

    Args:
        dna_strings: Iterable of strings to scan.
        pattern: Pattern passed unchanged to ``find_motif`` for each string.

    Returns:
        A ``(motifs, distances)`` tuple of two lists, each holding one entry
        per input string, in input order.
    """
    # Evaluate once per string, then split the (motif, distance) pairs.
    results = [find_motif(dna_string, pattern) for dna_string in dna_strings]
    motifs = [motif for motif, _ in results]
    distances = [distance for _, distance in results]
    return motifs, distances
    
def find_median_string(k, dna_strings):
    """Return the k-mer whose summed distance to all ``dna_strings`` is smallest.

    Every pattern from ``generate_all_kmers(k)`` is scored by summing the
    per-string distances reported by ``find_motifs``. The first pattern that
    reaches the smallest total is returned.

    Args:
        k: Pattern length handed to ``generate_all_kmers``.
        dna_strings: Collection of strings to score each pattern against.

    Returns:
        The best-scoring pattern, or ``""`` if ``generate_all_kmers(k)``
        produces no patterns. With no input strings every pattern totals 0,
        so the first generated pattern is returned.
    """
    # Start from infinity so the first pattern is always accepted. The old
    # seed, len(dna_strings) * len(dna_strings[0]), raised IndexError on an
    # empty collection and was too small when the first string was short.
    minimum_distance = float("inf")
    median_pattern = ""
    for pattern in generate_all_kmers(k):
        _, distances = find_motifs(dna_strings, pattern)
        distance_total = sum(distances)
        if distance_total < minimum_distance:
            minimum_distance = distance_total
            median_pattern = pattern
    return median_pattern

In [7]:
# Small hand-entered input: five DNA strings, searched with k = 3.
dna_strings = [
    "AAATTGACGCAT",
    "GACGACCACGTT",
    "CGTCAGCGCCTG",
    "GCTGAGCACCGG",
    "AGTTCGGGACAG"
]
k = 3

In [8]:
# Sanity check: the first sample string begins with "AAA", so the closest
# 3-mer to "AAA" must be an exact match at distance 0.
motif, distance = find_motif(dna_strings[0], "AAA")
assert motif == "AAA"
assert distance == 0

In [9]:
# Regression check on the small sample input: the expected median 3-mer is 'GAC'.
assert find_median_string(k, dna_strings) == 'GAC'

In [10]:
# Load the dataset: the first line holds k, every following line is one DNA string.
with open("./datasets/dataset_158_9.txt") as fin:
    # int() fails loudly on an empty file instead of silently keeping the
    # value of k left over from an earlier cell.
    k = int(fin.readline())
    # Skip blank lines (e.g. a trailing newline) so they do not become empty
    # DNA strings.
    dna_strings = [line.strip() for line in fin if line.strip()]

In [11]:
# Median string for the k and DNA strings currently bound (the dataset file
# when cells are run in order).
find_median_string(k, dna_strings)

'TTAACC'

In [12]:
# Second hand-entered input: three DNA strings, searched with k = 7.
# Rebinds dna_strings and k, replacing the values used by the earlier cells.
dna_strings = [
    "CTCGATGAGTAGGAAAGTAGTTTCACTGGGCGAACCACCCCGGCGCTAATCCTAGTGCCC",
    "GCAATCCTACCCGAGGCCACATATCAGTAGGAACTAGAACCACCACGGGTGGCTAGTTTC",
    "GGTGTTGAACCACGGGGTTAGTTTCATCTATTGTAGGAATCGGCTTCAAATCCTACACAG"
]
k = 7

In [13]:
# Median 7-mer for the three hand-entered strings (when cells are run in order).
find_median_string(k, dna_strings)

'AATCCTA'

In [16]:
# NOTE(review): scratch arithmetic; what 16, 15, 10 and the 1e6 divisor stand
# for is not documented here — confirm the intent or remove this cell.
16*15*10/1000000

0.0024