In [29]:
from collections import defaultdict
from copy import deepcopy

import common.bwt as b

In [72]:
def get_symbol_credits(last, top, bottom, symbol, d, sentinel="$"):
    """Return the symbols that may extend a match backwards, with the budget each leaves.

    Scans ``last[top:bottom+1]`` and looks only at the first character of
    every item (items are labelled symbols such as ``'A12'``).

    Args:
        last: sequence of labelled BWT symbols; ``item[0]`` is the symbol.
        top: first row of the current range (inclusive).
        bottom: last row of the current range (inclusive).
        symbol: the pattern symbol expected at this step.
        d: number of mismatches still allowed.
        sentinel: end-of-text marker. It is never accepted as a mismatch,
            because stepping onto it would let a match run past the end of
            the text or wrap around to its start.

    Returns:
        dict mapping each usable symbol to the remaining mismatch budget:
        ``d`` for an exact match with ``symbol``, ``d - 1`` for any other
        non-sentinel symbol (only offered while ``d > 0``).
    """
    symbol_credits = {}
    for labelled in last[top:bottom+1]:
        candidate = labelled[0]
        if candidate == symbol:
            # Exact match costs nothing.
            symbol_credits[candidate] = d
        elif d > 0 and candidate != sentinel:
            # Spend one mismatch, but never on the end-of-text marker.
            symbol_credits[candidate] = d - 1
    return symbol_credits

def bwt_matching_positions(suffix_array, first_occurence, last, pattern_, partial_count, C, d, top_=0, bottom_=-10, next_symbol=None):
    """Yield the matches of ``pattern_`` found by backward search with at most ``d`` mismatches.

    The pattern is consumed from its last symbol to its first. At every step
    each symbol allowed by ``get_symbol_credits`` narrows the row range
    ``[top_, bottom_]`` and the search recurses on the shortened pattern with
    the remaining mismatch budget.

    Args:
        suffix_array: lookup structure handed through to ``b.lookup_suffixes``.
        first_occurence: mapping symbol -> first row of that symbol.
        last: labelled BWT symbols (last column).
        pattern_: the part of the pattern still to be matched; it is only
            sliced, never modified.
        partial_count: count structure handed to ``b.get_count_symbol``.
        C: step parameter handed to ``b.get_count_symbol``.
        d: number of mismatches still allowed.
        top_: first row of the current range (inclusive).
        bottom_: last row of the current range (inclusive); the default
            ``-10`` means "not supplied" and is replaced by the last row.
        next_symbol: symbol used for the most recent extension, i.e. the
            first symbol of the matched string; ``None`` on the initial call.

    Yields:
        Whatever ``b.lookup_suffixes`` returns for the final row range
        (presumably text positions - confirm against ``common.bwt``).
    """
    # -10 can never be a computed bound (the smallest one is -1), so it is
    # safe to use as the "whole column" marker.
    if bottom_ == -10:
        bottom_ = len(last)-1

    # Empty range: this branch of the search has no match.
    if top_ > bottom_:
        return

    # Pattern fully consumed: every row left in the range is a match.
    if len(pattern_) == 0:
        yield from b.lookup_suffixes(next_symbol, suffix_array, last, first_occurence, top_, bottom_)
        return

    # Strings are immutable and slicing creates a new object, so no copy of
    # the pattern is needed before shortening it.
    symbol = pattern_[-1]
    remaining = pattern_[:-1]
    symbol_credits = get_symbol_credits(last, top_, bottom_, symbol, d)
    for candidate, credit in symbol_credits.items():
        offset = first_occurence[candidate]
        top = offset + b.get_count_symbol(candidate, top_, last, partial_count, C)
        bottom = offset + b.get_count_symbol(candidate, bottom_+1, last, partial_count, C) - 1
        yield from bwt_matching_positions(
            suffix_array, first_occurence, last, remaining, partial_count, C, credit, top, bottom, candidate)

def bwt_matching(text, patterns, C=5, d=1):
    """Collect, for every pattern, the matches in ``text`` with at most ``d`` mismatches.

    Appends the ``$`` sentinel to ``text``, builds the BWT-based lookup
    structures through ``common.bwt`` and runs ``bwt_matching_positions`` for
    each pattern. Progress is printed roughly every 5% of the patterns.

    Args:
        text: the text to search, without sentinel.
        patterns: sequence of patterns to look up.
        C: step parameter forwarded to the ``common.bwt`` helpers.
        d: maximum number of mismatches per match.

    Returns:
        defaultdict(list) mapping each pattern that has at least one match
        to the list of its matches.
    """
    bwt_text = b.bwt(text + "$")
    partial_counts = b.get_counts(bwt_text, C)
    first_occurence = b.get_first_occurence(bwt_text)
    print(f"first occurence {first_occurence}")
    partial_suffix_array = b.get_partial_suffix_array(bwt_text, first_occurence, C)

    total = len(patterns)
    p5 = int(0.05*total)+1  # progress reporting interval
    found = defaultdict(list)
    for i, pattern in enumerate(patterns):
        hits = list(bwt_matching_positions(
            partial_suffix_array, first_occurence, bwt_text, pattern, partial_counts, C, d))
        # Only touch the dict when there is something to store, so patterns
        # without matches get no entry.
        if hits:
            found[pattern].extend(hits)
        if i % p5 == 0:
            print(f"pattern {i} of {len(patterns)} processed")
    return found

In [73]:
# Small worked example: one text, four patterns, at most d mismatches per match.
text = "ACATGCTACTTT"
patterns = ["ATT", "GCC", "GCTA", "TATT"]
# Single-pattern variant, handy when tracing the search by hand:
#patterns = ["GCTA"]
d = 1

In [74]:
# Run the approximate matcher on the small example (default C) and print the matches per pattern.
b.print_matches(bwt_matching(text, patterns, d=d))

first occurence {'$': 0, 'A': 1, 'C': 4, 'G': 7, 'T': 8}
suffix array [('$1', 12), ('A1', 0), ('A2', 7), ('A3', 2), ('C1', 1), ('C2', 5), ('C3', 8), ('G1', 4), ('T1', 11), ('T2', 6), ('T3', 3), ('T4', 10), ('T5', 9)]
pattern 0 of 4 processed
pattern 1 of 4 processed
pattern 2 of 4 processed
pattern 3 of 4 processed
ATT: 2 7 8 9
GCC: 4
GCTA: 4
TATT: 6


In [75]:
# Input layout: line 1 = text, line 2 = whitespace-separated patterns, line 3 = d.
with open("../data/dataset_304_10.txt", "r") as fin:
    # Only the read needs the file handle; close it before the long matching run.
    lines = [line.strip() for line in fin]

text = lines[0]
# split() without an argument ignores repeated whitespace, so no empty
# pattern is ever handed to the matcher.
patterns = lines[1].split()
d = int(lines[2])
b.print_matches(bwt_matching(text, patterns, C=5, d=d))

first occurence {'$': 0, 'A': 1, 'C': 2416, 'G': 4960, 'T': 7571}
suffix array [('$1', 10000), ('A1', 4181), ('A10', 2787), ('A100', 8604), ('A1000', 7592), ('A1001', 3940), ('A1002', 1704), ('A1003', 592), ('A1004', 3742), ('A1005', 6274), ('A1006', 4572), ('A1007', 1524), ('A1008', 387), ('A1009', 1636), ('A101', 4185), ('A1010', 5805), ('A1011', 1679), ('A1012', 2953), ('A1013', 5138), ('A1014', 3648), ('A1015', 194), ('A1016', 3848), ('A1017', 8033), ('A1018', 5286), ('A1019', 5368), ('A102', 8026), ('A1020', 8038), ('A1021', 4661), ('A1022', 7002), ('A1023', 7991), ('A1024', 4613), ('A1025', 7933), ('A1026', 6236), ('A1027', 4701), ('A1028', 4850), ('A1029', 9201), ('A103', 8179), ('A1030', 3205), ('A1031', 9308), ('A1032', 5166), ('A1033', 6084), ('A1034', 3090), ('A1035', 4296), ('A1036', 8934), ('A1037', 1543), ('A1038', 2238), ('A1039', 6876), ('A104', 4046), ('A1040', 6296), ('A1041', 6918), ('A1042', 9219), ('A1043', 7213), ('A1044', 9509), ('A1045', 6928), ('A1046', 3351), 

pattern 101 of 2000 processed
pattern 202 of 2000 processed
pattern 303 of 2000 processed
pattern 404 of 2000 processed
pattern 505 of 2000 processed
pattern 606 of 2000 processed
pattern 707 of 2000 processed
pattern 808 of 2000 processed
pattern 909 of 2000 processed
pattern 1010 of 2000 processed
pattern 1111 of 2000 processed
pattern 1212 of 2000 processed
pattern 1313 of 2000 processed
pattern 1414 of 2000 processed
pattern 1515 of 2000 processed
pattern 1616 of 2000 processed
pattern 1717 of 2000 processed
pattern 1818 of 2000 processed
pattern 1919 of 2000 processed
TGATCCCCGCTACGTGCTCACTTTTTACAACTAATTTTGGCTACCTACCACTGGA: 9190
TATGCGCCGGCTCTTTAATCATGTTTCATCTGCTTTCTATCAGTTAAACGGGCCGACCGTTGC: 2165
TTTGATATTACCAGCAAATGACGGTCAACCATGAATGATGCGAGTACCCATAGGGTCTCAGAGCGTGC: 3920
AGACCGTCGACGAAACGAATTTGGACGAAGATAGCAAGGGCAATATTGATCGTTGGGAGCGAGCGGTT: 5576
AGGGTGACAGAGCGTGCCACGGGGATATGGTAAATTGCAGTCATAGGTTCATTAGTATT: 3971
CGTTGCACCCAATGAGACTGGTATAGGGCAGTTAACTTAGCAACTGTCCAGCTGGC: 6486
TTTCGATGG

TAGTCCAGCTTAAATATTACTGGCGGGCACGGCCTATTGTTATGCCCCCCTCCCTGCAACACGG: 5074
CGTCCTTGGCAAAGGCAGTACGGTACATAAGCTCGGAAGCGCCCAACTAGTTTCGAGA: 8837
TCCATTATTGGATCCAGAACGCCATGAGTGACCTAGGCACACGCGTCCTTCCCCGTTTTGGAACGAAAG: 9846
TGGAGCAAATCCCAGCCATTACTGTAAACTGCACGACACATGGTCCACTATCGTGAAGTCTTG: 8598
CCTGCATACGGACGGAACGCTACCCTCACTGTCAGCGGAAATTTCCTAGCCTAAGGATATT: 440
CCTCCGCCCAGATGGGCATCGCCACGTGGTCGGATAGGGCGCTAGCTCCTTTAGTACACGCAC: 9285
AGTGGGAGCAAATGGCTTAGGTTGTCGTCATAGGTTAAACTTGGATGTAGCATTTGAGCGATACGC: 1410
GGGCTGGTGCCGGCTATTACTGGAATTAACGAGCTAAACCACCCTCGTCCCGAAGAGAACAAGACAAC: 3381
TCTGTGCCTTAGGACCCGTATGACCACCCTACGATCTGCCCCACGGGGTAGAGCAACAGCCGGA: 1247
GTTCTTGTTCGGTATGTTACAATGGGCAGAAGATGCTCTGCATACCGCGACAGAATA: 7584
CCGGAGGTGTACAGGTGGCGTGGCGGCGACTTACTTAATTACAGGAACTTTGGCCATCGG: 7481
CGGGCACGGCCTATTGTTATGCCCCCCTCCCTGCTACACGGACGTAGGGTTACACCG: 5097
CCACGGGGTTCCAAATTAGGCTGAATCCGTATCTTGTCACAAGTGATCACTGTACAACGGTGGTAT: 3686
GAAAATTTCGTATGCAACTCGATGTGGCCAGGCTGCAAACACTATCCCCAAGCACA: 9074
AGACGCATAAAATTTGGTAAGCGTCCCTTTC

In [76]:
# Quiz: scratch cells that run the common.bwt helpers on small inputs.

In [77]:
# BWT of "banana$" (the sentinel is already part of the input string here).
last = b.bwt("banana$")

In [78]:
# Show the labelled symbols returned by b.bwt.
print(last)

['a1', 'n1', 'n2', 'b1', '$1', 'a2', 'a3']


In [81]:
# First-occurrence table for the banana example (same helper bwt_matching uses).
first = b.get_first_occurence(last)

In [82]:
# Suffix-array lookup for the banana example with C=1; per the recorded output
# the helper prints the array and returns a mapping keyed by labelled symbol.
b.get_partial_suffix_array(last, first, C=1)

suffix array [('$1', 6), ('a1', 5), ('a2', 3), ('a3', 1), ('b1', 0), ('n1', 4), ('n2', 2)]


{'$1': 6, 'a1': 5, 'a2': 3, 'a3': 1, 'b1': 0, 'n1': 4, 'n2': 2}

In [83]:
# Second quiz input: BWT of another sentinel-terminated string.
b.bwt("ACCAACACTG$")

['G1', 'C1', 'A1', '$1', 'C2', 'C3', 'A2', 'A3', 'A4', 'T1', 'C4']