In [50]:
from copy import deepcopy
from itertools import product

In [3]:
def hamming_distance(kmer1, kmer2):
    """Count the positions at which two strings differ.

    The strings are compared pairwise with ``zip``, so if their lengths
    differ only the leading ``min(len(kmer1), len(kmer2))`` positions are
    compared and the extra tail is ignored.

    Args:
        kmer1: First string.
        kmer2: Second string.

    Returns:
        The number of compared positions holding different characters.
    """
    return sum(1 for left, right in zip(kmer1, kmer2) if left != right)

In [4]:
# Sanity check: these two 11-character strings differ at exactly 3 positions.
assert hamming_distance("GGGCCGTTGGT","GGACCGTTGAC") == 3

In [163]:
# Hamming distance between two 50-character strings; the value is shown as the cell output.
hamming_distance("CTACAGCAATACGATCATATGCGGATCCGCAGTGGCCGGTAGACACACGT",
                "CTACCCCGCTGCTCAATGACCGGGACTAAAGAGGCGAAGATTATGGTGTG")

36

In [6]:
# The dataset is expected to hold exactly two newline-separated strings;
# any other line count makes the tuple unpacking fail.
with open("datasets/dataset_9_3.txt", "r") as fin:
    kmer1, kmer2 = fin.read().strip().split("\n")

# The file is no longer needed once both strings are in memory.
print(hamming_distance(kmer1, kmer2))

906


In [7]:
def approximate_matching(pattern, genome, distance):
    """Find every position where ``pattern`` occurs in ``genome`` approximately.

    Each window of ``len(pattern)`` characters is compared against
    ``pattern`` with ``hamming_distance``; a window matches when it has at
    most ``distance`` mismatches.

    Args:
        pattern: String to search for.
        genome: Text to scan.
        distance: Maximum number of mismatches allowed.

    Returns:
        Ascending list of 0-based start indices of the matching windows.
    """
    width = len(pattern)
    last_start = len(genome) - width
    return [
        start
        for start in range(last_start + 1)
        if hamming_distance(pattern, genome[start:start + width]) <= distance
    ]

In [8]:
# Check approximate_matching on a small example: with up to 3 mismatches the
# pattern should match at these 0-based start indices.
assert approximate_matching(
    "ATTCTGGA",
    "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT",
    3
) == [6, 7, 26, 27]

In [13]:
# The dataset is expected to hold three lines: pattern, genome, and the
# mismatch budget (read as text, converted to int at the call site).
with open("datasets/dataset_9_4.txt", "r") as fin:
    pattern, genome, distance = fin.read().strip().split("\n")

# Print the matching start indices as one space-separated line.
print(" ".join(str(index) for index in approximate_matching(pattern, genome, int(distance))))


361 385 656 849 1013 1089 1507 1572 1613 2146 2387 2746 2951 3079 3378 3800 3817 3835 3938 4083 4159 4320 4575 5111 5145 5192 5193 5389 6271 6555 6657 7056 8125 8821 9388 9531 9599 9692 9820 9967 10749 11233 11358 12288 12356 13170 13930 14561 14891 15431 15432 16111 16146 16147 16461 16953 17679 18359


In [11]:
# Same example as the assertion cell; the list of start indices is shown as the cell output.
approximate_matching(
    "ATTCTGGA",
    "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAAT",
    3
)

[6, 7, 26, 27]

In [14]:
def approximate_count(pattern, genome, distance):
    """Count the approximate occurrences of ``pattern`` in ``genome``.

    Args:
        pattern: String to search for.
        genome: Text to scan.
        distance: Maximum number of mismatches allowed per occurrence.

    Returns:
        The number of start positions reported by ``approximate_matching``.
    """
    occurrences = approximate_matching(pattern, genome, distance)
    return len(occurrences)

In [15]:
# Number of 5-character windows within 2 mismatches of "AAAAA"; shown as the cell output.
approximate_count("AAAAA", "AACAAGCTGATAAACATTTAAAGAG", 2)

11

In [164]:
# Number of 2-character windows within 1 mismatch of "AA"; shown as the cell output.
approximate_count("AA", "TACGCATTACAAAGCACA", 1)

13

In [17]:
# The dataset is expected to hold three lines: pattern, genome, and the
# mismatch budget (read as text, converted to int at the call site).
with open("datasets/dataset_9_6.txt", "r") as fin:
    pattern, genome, distance = fin.read().strip().split("\n")

print(approximate_count(pattern, genome, int(distance)))


22


In [160]:
nucleotides = "ATGC"

def kmer_neighbors(kmer, distance):
    """Return every string within ``distance`` substitutions of ``kmer``.

    The result is always a dict whose keys are the neighbors and whose
    values are all ``0``, so it can be used directly as a table of
    zero-initialised counters. This holds for ``distance == 0`` and for
    single-character k-mers as well, so callers can rely on ``.keys()`` and
    ``dict.update`` in every case. Keys are produced in a deterministic
    order.

    Args:
        kmer: The string whose neighborhood is generated.
        distance: Maximum number of substituted positions.

    Returns:
        ``{neighbor: 0, ...}``; it contains ``kmer`` itself whenever
        ``distance >= 0`` and is empty for a negative ``distance``.
    """
    if distance < 0:
        # Nothing lies within a negative number of substitutions.
        return {}

    # Neighbors of the suffix processed so far, mapped to the number of
    # substitutions each one has already used. Building right-to-left keeps
    # the mismatch count incremental, so no full string comparison is needed.
    partial = {"": 0}
    for base in reversed(kmer):
        # A symbol outside the alphabet can still be kept unchanged at no
        # cost, so the k-mer itself is always part of its own neighborhood.
        choices = nucleotides if base in nucleotides else nucleotides + base
        extended = {}
        for tail, mismatches in partial.items():
            for nucleotide in choices:
                cost = mismatches + (nucleotide != base)
                if cost <= distance:
                    extended[nucleotide + tail] = cost
        partial = extended

    # Callers expect zero-valued counters, not the substitution counts.
    return dict.fromkeys(partial, 0)

In [162]:
# The 1-neighborhood of "ACG": the k-mer itself plus its nine single-substitution variants.
expected = {"ACG", "CCG", "GCG", "TCG", "AAG", "AGG", "ATG", "ACA", "ACC", "ACT"}
assert set(kmer_neighbors("ACG", 1)) == expected

In [165]:
# Size of the 3-neighborhood of "ACGT"; shown as the cell output.
len(kmer_neighbors("ACGT", 3))

175

In [77]:
def neighbors(genome, k, distance):
    """Collect the neighborhoods of every k-mer window in ``genome``.

    Args:
        genome: Text whose k-mer windows are enumerated.
        k: Window length.
        distance: Maximum number of mismatches passed to ``kmer_neighbors``.

    Returns:
        Dict mapping each distinct neighbor to ``0``, ready to be used as a
        table of counters.
    """
    all_neighbors = dict()
    for start in range(len(genome) - k + 1):
        window_neighbors = kmer_neighbors(genome[start:start + k], distance)
        # Normalise through dict.fromkeys: handing dict.update a plain
        # collection of strings would make it try to unpack each string as
        # a (key, value) pair instead of adding the string as a key.
        all_neighbors.update(dict.fromkeys(window_neighbors, 0))
    return all_neighbors

In [87]:
def approximate_max_frequency(genome, k, distance):
    """Find the most frequent k-mers in ``genome`` allowing mismatches.

    Candidates are all neighbors (within ``distance``) of the k-mer windows
    of ``genome``; each candidate is scored with ``approximate_count``.

    Args:
        genome: Text to analyse.
        k: Length of the k-mers.
        distance: Maximum number of mismatches allowed per occurrence.

    Returns:
        List of the candidates sharing the highest count, in the order the
        candidates were generated. Empty when there are no candidates
        (for example when ``genome`` is shorter than ``k``).
    """
    kmer_count = {
        pattern: approximate_count(pattern, genome, distance)
        for pattern in neighbors(genome, k, distance)
    }

    # Without candidates there is no maximum to report.
    if not kmer_count:
        return []

    # Only the maximum matters, so a single pass replaces a full sort.
    max_count = max(kmer_count.values())
    return [kmer for kmer, count in kmer_count.items() if count == max_count]

In [91]:
# Check: with 4-mers and up to 1 mismatch, three k-mers tie for the top count.
assert set(
    approximate_max_frequency("ACGTTGCATGTCGCATGATGCATGAGAGCT", 4, 1)
) == {"GATG", "ATGC", "ATGT"}

In [94]:
# The dataset is expected to hold two lines: the genome, then "k distance"
# separated by a single space.
with open("datasets/dataset_9_7.txt", "r") as fin:
    genome, k_distance = fin.read().strip().split("\n")

k, distance = k_distance.split(" ")
print(" ".join(approximate_max_frequency(genome, int(k), int(distance))))

GGGGGG


In [113]:
# Complement for each of the four nucleotide symbols.
complement_map = {
    "A": "T",
    "G": "C",
    "T": "A",
    "C": "G"
}

def reverse_complement(dna_string):
    """Return the reverse complement of ``dna_string``.

    The string is read back to front and each symbol is replaced through
    ``complement_map``.

    Args:
        dna_string: String made of the symbols A, C, G and T.

    Returns:
        The reverse-complemented string.

    Raises:
        KeyError: If a symbol has no entry in ``complement_map``.
    """
    return "".join(complement_map[symbol] for symbol in reversed(dna_string))

def approximate_max_frequency_rc(genome, k, distance):
    """Find the most frequent k-mers counting mismatches and reverse complements.

    Candidates are all neighbors (within ``distance``) of the k-mer windows
    of ``genome``. Each candidate's score is its approximate count plus the
    approximate count of its reverse complement.

    Args:
        genome: Text to analyse.
        k: Length of the k-mers.
        distance: Maximum number of mismatches allowed per occurrence.

    Returns:
        Set of the candidates sharing the highest combined score; empty
        when there are no candidates (for example when ``genome`` is
        shorter than ``k``).
    """
    candidates = neighbors(genome, k, distance)
    if not candidates:
        return set()

    # A pattern is counted both as itself and as the reverse complement of
    # another candidate, so remember each count instead of rescanning.
    counted = {}

    def count(pattern):
        if pattern not in counted:
            counted[pattern] = approximate_count(pattern, genome, distance)
        return counted[pattern]

    # The sum is written without a trailing conditional: an
    # "a + b if cond else 0" expression applies the condition to the whole
    # sum, not just to b.
    totals = {
        pattern: count(pattern) + count(reverse_complement(pattern))
        for pattern in candidates
    }

    best = max(totals.values())
    return {pattern for pattern, total in totals.items() if total == best}

In [114]:
# Check: with reverse complements counted, "ATGT" and its reverse complement "ACAT" tie for the top score.
assert approximate_max_frequency_rc(
    "ACGTTGCATGTCGCATGATGCATGAGAGCT", 4, 1
) == {"ATGT", "ACAT"}

In [115]:
# The dataset is expected to hold two lines: the genome, then "k distance"
# separated by a single space.
with open("datasets/dataset_9_8.txt","r") as fin:
    genome, k_distance = fin.read().strip().split("\n")
    k, distance = k_distance.split(" ")
    # The result is a set, whose iteration order is not stable between
    # runs; sort it so the printed line is reproducible.
    print(" ".join(sorted(approximate_max_frequency_rc(genome, int(k), int(distance)))))

CGCGGCC CGGCGCC GGCCGCG GGCGCCG


### Application to the Salmonella genome

In [121]:
# Read the whole file and drop the line breaks so it becomes one contiguous string.
with open("datasets/salmonella.txt", "r") as fin:
    s = fin.read().replace("\n", "")

# NOTE(review): presumably the positions of the skew minima; not used below - confirm.
minima = [3764856, 3764858]

# Window bounds around those positions; `up` is included by the slice, so
# the window spans up to 1000 characters.
lb = 3764857 - 500
up = 3764856 + 500
oriC = s[lb:up + 1]

In [122]:
# Most frequent 9-mers in the oriC window, allowing 1 mismatch and counting
# reverse complements; the resulting set is shown as the cell output.
approximate_max_frequency_rc(oriC,9,1)

{'TGTGGATAA', 'TTATCCACA'}