In [None]:
import bm_preproc

In [None]:
# Read the FASTA excerpt into one string, dropping '>' header lines and the
# line breaks inside the sequence. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('chr1.GRCh38.excerpt.fasta') as fastaFile:
    genome = ''.join(line.strip() for line in fastaFile if not line.startswith('>'))
# Query fragment searched for in the genome excerpt.
q1Frag = 'GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG'

In [None]:
def naiveMatches(genome, fragment):
    """Exhaustively slide ``fragment`` along ``genome`` and report exact hits.

    Every start offset at which ``fragment`` fits inside ``genome`` is tried.
    Characters are compared left to right and an alignment is abandoned at
    its first mismatch.

    Returns:
        A tuple ``(matches, alignments, comps)``: the list of start offsets
        of exact occurrences, the number of alignments tried, and the total
        number of character comparisons performed.
    """
    fragLen = len(fragment)
    hits = []
    tried = 0
    compared = 0
    for start in range(len(genome) - fragLen + 1):
        tried += 1
        # Advance through the leading run of agreeing characters.
        matched = 0
        while matched < fragLen and genome[start + matched] == fragment[matched]:
            matched += 1
        if matched == fragLen:
            # Full match: exactly one comparison per pattern character.
            compared += fragLen
            hits.append(start)
        else:
            # The failed comparison that ended the run is counted as well.
            compared += matched + 1
    return hits, tried, compared


In [None]:
# Question 1
# Number of alignments the naive matcher tries for q1Frag: one for every
# start offset at which the fragment fits entirely inside the genome.
len(genome) - len(q1Frag) + 1

In [None]:
# Question 2
# Full naive scan; the result is
# (match offsets, alignments tried, character comparisons made).
naiveMatches(genome, q1Frag)

In [None]:
def bmMatches(t, p, alpha='ACGT'):
    """Search text ``t`` for pattern ``p`` using Boyer-Moore style skipping.

    Each alignment is compared right to left. On a mismatch the alignment
    advances by the largest of 1, the bad-character skip and the good-suffix
    skip; after a full match it advances by the larger of 1 and the match
    skip. The skip values come from ``bm_preproc.BoyerMoore(p, alpha)``,
    whose implementation is not part of this notebook.

    Args:
        t: Text to search.
        p: Pattern to look for.
        alpha: Alphabet handed to the preprocessing object.

    Returns:
        A tuple ``(matches, alignments, comps)``: start offsets of exact
        occurrences, number of alignments tried, and number of character
        comparisons performed.
    """
    preproc = bm_preproc.BoyerMoore(p, alpha)
    occurrences = []
    numAlignments = 0
    numComparisons = 0
    lastStart = len(t) - len(p)
    start = 0
    while start <= lastStart:
        numAlignments += 1
        for k in range(len(p) - 1, -1, -1):
            numComparisons += 1
            textChar = t[start + k]
            if textChar != p[k]:
                # Mismatch: take the most aggressive skip, never less than 1.
                skip = max(1, preproc.bad_character_rule(k, textChar), preproc.good_suffix_rule(k))
                break
        else:
            # Loop finished without a mismatch, so the whole pattern matched.
            occurrences.append(start)
            skip = max(1, preproc.match_skip())
        start += skip
    return occurrences, numAlignments, numComparisons


In [None]:
# Naive scan on a short sentence: a small baseline for the alignment and
# comparison counts. Result is (match offsets, alignments, comparisons).
naiveMatches('there would have been a time for such a word', 'word')

In [None]:
# Same sentence and pattern through bmMatches. The alphabet argument lists
# lowercase letters plus space, which covers every character in this text.
bmMatches('there would have been a time for such a word', 'word', 'abcdefghijklmnopqrstuvwxyz ')

In [None]:
# Second small check: the pattern occurs at both ends of the text, with
# near-misses ('need', 'noodle') in between.
bmMatches('needle need noodle needle', 'needle', 'abcdefghijklmnopqrstuvwxyz ')

In [None]:
# Boyer-Moore run on the genome excerpt with the default 'ACGT' alphabet.
# NOTE(review): assumes the excerpt contains only A/C/G/T; confirm how
# bm_preproc treats any other character (e.g. 'N').
bmMatches(genome, q1Frag)

In [None]:
import kmer_index

In [None]:
# Build an index over the genome with the second argument 8 (presumably the
# k-mer length; confirm against kmer_index).
# NOTE(review): no later cell shown here reads this module-level `idx`;
# kmerIndexMatches constructs its own index internally.
idx = kmer_index.Index(genome, 8)

In [None]:
# Scratch check: enumerate() over a string yields (index, character) pairs.
list(enumerate('abc'))

In [None]:
def countMismatches(p,t):
    """Count the positions at which ``p`` differs from ``t``.

    Only the first ``len(p)`` positions of ``t`` are examined, so ``t`` may be
    longer than ``p``. A ``t`` shorter than ``p`` raises ``IndexError`` when
    the first position past its end is reached.
    """
    return sum(1 for pos, ch in enumerate(p) if ch != t[pos])

def kmerIndexMatches(t, p):
    """Find occurrences of a 24-character pattern with at most 2 mismatches.

    ``p`` is cut into three non-overlapping 8-mers (offsets 0, 8 and 16).
    If an occurrence has at most 2 mismatches, at least one of the three
    pieces must match exactly (pigeonhole), so each piece is looked up in an
    index over ``t`` and every hit is verified by counting mismatches in the
    rest of the pattern.

    Args:
        t: Text to search.
        p: Pattern; must be exactly 24 characters long.

    Returns:
        A tuple ``(matches, hits)``: the sorted list of start offsets in ``t``
        where ``p`` occurs with at most 2 mismatches, and the number of
        distinct offsets returned by the index over the three queries.

    Raises:
        AssertionError: if ``len(p) != 24``.
    """
    assert len(p) == 24
    # presumably query() returns the offsets in t where the 8-mer occurs
    # exactly -- kmer_index is not visible here; confirm.
    idx = kmer_index.Index(t, 8)
    matches = set()
    indexHits = set()
    for pIdx in [0,8,16]:
        kmer = p[pIdx : pIdx + 8]
        locs = idx.query(kmer)
        indexHits.update(locs)
        for loc in locs:
            start = loc - pIdx  # where the whole pattern would begin in t
            # Skip only alignments that overhang the text. An alignment that
            # ends exactly at len(t) is valid, hence ``>`` and not ``>=``.
            if start < 0 or start + 24 > len(t):
                continue
            # The indexed 8-mer already matches; verify what lies on either side.
            leftMismatches = countMismatches(p[:pIdx], t[start:loc])
            rightMismatches = countMismatches(p[pIdx+8:], t[loc+8:start+24])
            if leftMismatches + rightMismatches <= 2:
                matches.add(start)
    return sorted(matches), len(indexHits)


In [None]:
def isMatchAt(genome, fragment, offset, maxMismatches = 0):
    """Tell whether ``fragment`` fits ``genome`` at ``offset`` within a mismatch budget.

    Characters are compared left to right and the scan stops as soon as the
    number of mismatches exceeds ``maxMismatches``.

    Args:
        genome: Text being searched.
        fragment: Pattern to align.
        offset: Index in ``genome`` at which the alignment starts.
        maxMismatches: Largest number of mismatching positions tolerated.

    Returns:
        ``True`` if the alignment has at most ``maxMismatches`` mismatches,
        otherwise ``False``.
    """
    mismatchCount = 0
    for pos, base in enumerate(fragment):
        if genome[offset + pos] == base:
            continue
        mismatchCount += 1
        if mismatchCount > maxMismatches:
            # Budget exceeded; no need to look at the remaining characters.
            return False
    return mismatchCount <= maxMismatches

def getMatches(genome, fragment, maxMismatches):
    """Brute-force approximate search of ``fragment`` in ``genome``.

    Tries every start offset at which ``fragment`` fits inside ``genome`` and
    keeps those that ``isMatchAt`` accepts.

    Returns:
        The accepted start offsets, in increasing order.
    """
    lastStart = len(genome) - len(fragment)
    return [start for start in range(lastStart + 1)
            if isMatchAt(genome, fragment, start, maxMismatches)]

In [None]:
# 24-character pattern (the length kmerIndexMatches asserts on).
frag = 'GGCGCGGTGGCTCACGCCTGTAAT'
# Cross-check: the brute-force scan allowing 2 mismatches should report the
# same offsets as the index-assisted search. kmerIndexMatches already returns
# its offsets sorted, so the sorted() call here does not change the list.
getMatches(genome, frag, 2) == sorted(kmerIndexMatches(genome, frag)[0])

In [None]:
# Number of offsets where frag occurs with at most 2 mismatches.
# Note: this call rebuilds the index and repeats the whole search.
len(kmerIndexMatches(genome, frag)[0])

In [None]:
# Second element of the result: number of distinct offsets returned by the
# index across the three 8-mer queries.
kmerIndexMatches(genome, frag)[1]

In [None]:
import bisect
   
class SubseqIndex(object):
    """ Sorted lookup table of spaced subsequences drawn from a text T """

    def __init__(self, t, k, ival):
        """ Index every subsequence of t made of k characters taken ival
            positions apart.  For example SubseqIndex("ATAT", 2, 2) stores
            ("AA", 0) and ("TT", 1). """
        self.k = k  # characters in each extracted subsequence
        self.ival = ival  # stride between picked characters; 1 means adjacent
        self.span = 1 + ival * (k - 1)  # text positions one subsequence stretches over
        # (subsequence, offset) pairs ordered by subsequence, then by offset
        self.index = sorted(
            (t[start:start + self.span:ival], start)
            for start in range(len(t) - self.span + 1)
        )

    def query(self, p):
        """ Return the text offsets whose subsequence equals the first
            subsequence of p """
        subseq = p[:self.span:self.ival]  # only the leading subsequence of p is used
        # -1 sorts before every real offset, so this lands on the first candidate
        pos = bisect.bisect_left(self.index, (subseq, -1))
        end = len(self.index)
        hits = []
        while pos < end and self.index[pos][0] == subseq:
            hits.append(self.index[pos][1])
            pos += 1
        return hits

def kmerSubseqIndexMatches(t, p):
    """Find occurrences of a 24-character pattern with at most 2 mismatches,
    using a subsequence index.

    The index holds subsequences of 8 characters taken every 3rd position.
    Querying with ``p[0:]``, ``p[1:]`` and ``p[2:]`` looks up the three
    interleaved subsequences of ``p`` that start at positions 0, 1 and 2;
    together they cover all 24 positions exactly once. An occurrence with at
    most 2 mismatches therefore leaves at least one of them mismatch-free
    (pigeonhole). Every index hit is verified against the full pattern.

    Args:
        t: Text to search.
        p: Pattern; must be exactly 24 characters long.

    Returns:
        A tuple ``(matches, hits)``: the sorted list of start offsets in ``t``
        where ``p`` occurs with at most 2 mismatches, and the number of
        distinct offsets returned by the index over the three queries.

    Raises:
        AssertionError: if ``len(p) != 24``.
    """
    assert len(p) == 24
    idx = SubseqIndex(t, 8, 3)
    matches = set()
    indexHits = set()
    for pIdx in [0,1,2]:
        # query() only uses the first subsequence of what it is given, so
        # passing the suffix selects the subsequence starting at pIdx.
        kmer = p[pIdx:]
        locs = idx.query(kmer)
        indexHits.update(locs)
        for loc in locs:
            start = loc - pIdx  # where the whole pattern would begin in t
            # Skip only alignments that overhang the text. An alignment that
            # ends exactly at len(t) is valid, hence ``>`` and not ``>=``.
            if start < 0 or start + 24 > len(t):
                continue
            mismatches = countMismatches(p, t[start:start+24])
            if mismatches <= 2:
                matches.add(start)
    return sorted(matches), len(indexHits)


In [None]:
# Subsequence-index search for frag; the result is
# (sorted match offsets, number of distinct index hits).
kmerSubseqIndexMatches(genome, frag)