In [42]:
import bisect

In [24]:
class Index(object):
    """Sorted k-mer index over a text, searched with binary search.

    Every substring of length ``k_mer`` of ``text`` is stored together with
    its start offset. The (k-mer, offset) pairs are kept sorted, so all
    occurrences of the same k-mer are adjacent in ``self.indices``.
    """

    def __init__(self, text: str, k_mer: int):
        """Build the index.

        Args:
            text: the string to index.
            k_mer: length of the substrings to index.
        """
        self.k_mer = k_mer
        # One entry per window of length k_mer; tuples sort by k-mer first,
        # then by offset.
        window_count = len(text) - k_mer + 1
        self.indices = sorted(
            (text[start:start + k_mer], start) for start in range(window_count)
        )

    def query(self, pattern):
        """Return the offsets whose k-mer equals the first ``k_mer`` characters of ``pattern``.

        Only the prefix is looked up; the caller has to verify the rest of
        the pattern. Offsets come back in ascending order. A pattern shorter
        than ``k_mer`` matches no stored entry and yields an empty list.
        """
        key = pattern[:self.k_mer]
        # Stored offsets are >= 0, so (key, -1) sorts before every entry for
        # this k-mer and (key, inf) sorts after every one of them.
        first = bisect.bisect_left(self.indices, (key, -1))
        past_last = bisect.bisect_right(self.indices, (key, float("inf")))
        return [offset for _, offset in self.indices[first:past_last]]
    

In [28]:
def query_index(pattern, text, index_obj : Index):
    """Return every offset in ``text`` at which ``pattern`` occurs, using a k-mer index.

    The index supplies candidate offsets whose first ``k_mer`` characters
    already equal the start of ``pattern``; this function keeps only the
    candidates where the rest of the pattern matches as well.

    Args:
        pattern: string to search for.
        text: string to search in (presumably the text ``index_obj`` was
            built from -- this is not checked here).
        index_obj: object exposing ``k_mer`` and ``query(pattern)``.

    Returns:
        List of matching offsets, in the order the index reports them.
    """
    prefix_len = index_obj.k_mer
    pattern_len = len(pattern)
    remainder = pattern[prefix_len:]
    return [
        candidate
        for candidate in index_obj.query(pattern)
        # The prefix was already matched by the index lookup; compare the tail only.
        if text[candidate + prefix_len:candidate + pattern_len] == remainder
    ]

In [29]:
# Demo: index every 2-mer of the text, then report all offsets where the
# pattern occurs.
text = "ATCATCTTAACCTTTCA"
pattern = "ATC"
index_object = Index(text, 2)  # second argument is the k-mer length
print(query_index(pattern, text, index_object))

[0, 3]


In [50]:
# Variant of the k-mer index: only every nth k-mer of the text is stored.
class IndexPerNth(object):
    def __init__(self, text : str, k_mer : int, nth):
        self.nth = nth
        self.k_mer = k_mer
        self.indices = []
        for i in range(0, len(text) - self.k_mer + 1, nth):
            self.indices.append((text[i:i+self.k_mer], i))
        self.indices.sort()

    def query(self, pattern):
        hits = []
        for k in range(self.nth):
            k_mer_str = pattern[k:k+self.k_mer]
            i = bisect.bisect_left(self.indices, (k_mer_str, -1))
            while i < len(self.indices):
                if self.indices[i][0] != k_mer_str:
                    break
                hit = self.indices[i][1] - k
                hits.append(hit)
                i += 1
        return hits

In [59]:

def query_index_nth(pattern, text, index_obj : IndexPerNth):
    """Return every offset in ``text`` at which ``pattern`` occurs, using a sparse k-mer index.

    ``index_obj.query`` supplies candidate start offsets; each candidate is
    confirmed by comparing the whole pattern against the text at that offset.

    Args:
        pattern: string to search for.
        text: string to search in (presumably the text ``index_obj`` was
            built from -- this is not checked here).
        index_obj: object exposing ``query(pattern)``.

    Returns:
        List of confirmed offsets, in the order the index reports them.
    """
    pattern_len = len(pattern)
    offsets = []
    for candidate in index_obj.query(pattern):
        # A negative candidate would make the slice below count from the end
        # of the text; it can never be a real occurrence.
        if candidate < 0:
            continue
        # One full comparison is enough: checking pattern[:i] and pattern[i:]
        # separately tests exactly the same equality.
        if text[candidate:candidate + pattern_len] == pattern:
            offsets.append(candidate)
    return offsets

In [60]:
"""
def query_index_nth(pattern, text, index_obj: IndexPerNth):
    k_mer = index_obj.k_mer
    nth = index_obj.nth
    offsets = []

    for index in index_obj.query(pattern):
        # Check full match directly
        if pattern == text[index:index + len(pattern)]:
            offsets.append(index)
        else:
            # Handle cases where pattern might partially overlap
            for i in range(1, nth):
                if pattern[:i] == text[index:index + i] and pattern[i:] == text[index + i:index + len(pattern)]:
                    offsets.append(index)
                    #break  # No need to check further for this index

    return offsets
"""

'\ndef query_index_nth(pattern, text, index_obj: IndexPerNth):\n    k_mer = index_obj.k_mer\n    nth = index_obj.nth\n    offsets = []\n\n    for index in index_obj.query(pattern):\n        # Check full match directly\n        if pattern == text[index:index + len(pattern)]:\n            offsets.append(index)\n        else:\n            # Handle cases where pattern might partially overlap\n            for i in range(1, nth):\n                if pattern[:i] == text[index:index + i] and pattern[i:] == text[index + i:index + len(pattern)]:\n                    offsets.append(index)\n                    #break  # No need to check further for this index\n\n    return offsets\n'

In [61]:
# Demo: index the 2-mers starting at every 2nd offset of the text, then
# report all offsets where the pattern occurs.
text = "ATCATCTTAACCTTTCA"
pattern = "ATC"
index_object = IndexPerNth(text, 2, 2)  # arguments: text, k-mer length, step between indexed offsets
print(query_index_nth(pattern, text, index_object))

[0, 3]
