In [9]:
from collections import defaultdict

In [18]:
# Base-pairing table: every nucleotide mapped to its complementary nucleotide.
complement_map = dict(zip("AGTC", "TCAG"))

def reverse_complement(dna_string):
    """Return the reverse complement of ``dna_string``.

    The input is walked from its last character to its first, and each
    character is replaced by its partner from ``complement_map``.

    Raises:
        KeyError: if a character is not a key of ``complement_map``.
    """
    return "".join(complement_map[base] for base in reversed(dna_string))

def shared_kmers(k, genome1, genome2):
    """Find all positions where two genomes share a k-mer.

    A pair ``(i, j)`` is reported when the k-mer starting at index ``j`` of
    ``genome2`` equals either the k-mer starting at index ``i`` of
    ``genome1`` or the reverse complement of that k-mer.

    Args:
        k: length of the k-mers to compare; must be positive.
        genome1: first DNA string (indexed by the first element of each pair).
        genome2: second DNA string (indexed by the second element of each pair).

    Returns:
        A list of ``(i, j)`` tuples. Forward matches come first, followed by
        reverse-complement matches; each pair appears at most once. An empty
        list is returned when ``k`` exceeds the length of either genome.

    Raises:
        ValueError: if ``k`` is not positive.
        KeyError: propagated from ``reverse_complement`` when ``genome1``
            contains a character it cannot complement.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # k-mer of genome1 -> start positions in genome1.
    forward_index = defaultdict(set)
    for i in range(len(genome1) - k + 1):
        forward_index[genome1[i:i + k]].add(i)

    # Reverse complement of each genome1 k-mer -> start position of the
    # original k-mer in genome1. A window starting at i in the reversed
    # string covers genome1[len - i - k : len - i].
    reverse_index = defaultdict(set)
    r = reverse_complement(genome1)
    for i in range(len(r) - k + 1):
        reverse_index[r[i:i + k]].add(len(r) - i - k)

    shared = []
    # A k-mer equal to its own reverse complement (e.g. "ACGT") matches in
    # both indexes; remember emitted pairs so it is reported only once.
    seen = set()
    for kmer2index in (forward_index, reverse_index):
        for j in range(len(genome2) - k + 1):
            # .get() avoids inserting empty sets into the defaultdict.
            positions = kmer2index.get(genome2[j:j + k])
            if positions is None:
                continue
            for i in positions:
                pair = (i, j)
                if pair not in seen:
                    seen.add(pair)
                    shared.append(pair)

    return shared

In [19]:
# Worked example with k=3: each resulting (i, j) pairs a start index in the
# first string with a start index in the second string.
shared_kmers(3, "AAACTCATC", "TTTCAAATC")

[(4, 2), (0, 4), (6, 6), (0, 0)]

In [17]:
# Spot check: reverse complement of the first example string.
reverse_complement("AAACTCATC")

'GATGAGTTT'

In [20]:
# Dataset layout: the first line holds k, the next two lines hold the genomes.
with open("../data/dataset_289_5.txt","r") as fin:
    # Skip blank lines so a trailing empty line cannot break the
    # two-genome unpacking below.
    lines = [line.strip() for line in fin if line.strip()]

# The file is fully read at this point; parse and compute with it closed.
k = int(lines[0])
genome1, genome2 = lines[1:]
for s in shared_kmers(k, genome1, genome2):
    print(s)

(19694, 73)
(19652, 73)
(8486, 73)
(183, 123)
(184, 124)
(6487, 167)
(6488, 168)
(7669, 168)
(5822, 168)
(15281, 195)
(15282, 196)
(7883, 196)
(2813, 196)
(15283, 197)
(10281, 290)
(1011, 290)
(18505, 405)
(8999, 405)
(18506, 406)
(18507, 407)
(19808, 745)
(17679, 745)
(17680, 746)
(17681, 747)
(17682, 748)
(17683, 749)
(6199, 793)
(7255, 895)
(7256, 896)
(17701, 896)
(2581, 896)
(17702, 897)
(19694, 1013)
(19652, 1013)
(8486, 1013)
(13281, 1048)
(1574, 1048)
(9319, 1125)
(9320, 1126)
(2417, 1126)
(16559, 1126)
(9321, 1127)
(12205, 1172)
(12206, 1173)
(7169, 1174)
(12207, 1174)
(12208, 1175)
(7170, 1175)
(14318, 1175)
(12209, 1176)
(14319, 1176)
(12210, 1177)
(6035, 1194)
(6036, 1195)
(8760, 1196)
(15811, 1196)
(6037, 1196)
(7882, 1254)
(15282, 1255)
(7883, 1255)
(2813, 1255)
(17801, 1297)
(1775, 1371)
(1776, 1372)
(1777, 1373)
(12108, 1373)
(1817, 1373)
(19278, 1452)
(6768, 1453)
(15140, 1453)
(19279, 1453)
(15141, 1454)
(6035, 1506)
(6036, 1507)
(8760, 1508)
(15811, 1508)
(6037, 1508)

In [21]:
# Quiz: shared 3-mers between the two quiz strings, counting both direct
# matches and reverse-complement matches.
shared_kmers(3,"TGCCCCGGTGGTGAG","AAGGTCGCACCTCGT")

[(9, 2), (6, 2), (0, 6), (10, 7), (7, 7), (9, 8), (6, 8), (12, 10)]