In [1]:
''' 
Naive BWT using the BWM. 
'''
def rotations(t):
    ''' Returns every rotation of t as a list.

    Entry k is t rotated left by k positions, so entry 0 is t itself.
    An empty t gives an empty list.
    '''
    size = len(t)
    # Rotating left by `shift` moves the first `shift` items to the end.
    return [t[shift:] + t[:shift] for shift in range(size)]

def bwm(t):
    ''' Returns the Burrows-Wheeler matrix of t: all rotations of t, sorted.

    The rows are ordered by Python's default lexicographic comparison.
    '''
    matrix = rotations(t)
    matrix.sort()
    return matrix

def bwtViaBwm(t):
    ''' Given T, returns BWT(T): the last column of the BWM of T + "$".

    The "$" terminator is appended here, so callers pass the bare text.
    '''
    terminated = t + "$"
    # The last character of every sorted rotation, read top to bottom.
    return ''.join(row[-1] for row in bwm(terminated))

# Demo: print the transform of two sample words (bwtViaBwm adds the "$" itself).
print(bwtViaBwm("wooloomooloo"))
print(bwtViaBwm("cat"))

oooooooolmwl$
tc$a


### Sorted Suffix Array & Burrows-Wheeler Transform

In [2]:
def suffixArray(s):
    ''' Returns the sorted suffixes of s together with their start positions.

    s is used exactly as given; no "$" terminator is appended here.

    Returns a pair (suffixes, indices):
      suffixes : every suffix of s in lexicographic order
      indices  : indices[k] is the position in s where suffixes[k] starts
                 (i.e. the suffix array of s)
    An empty s gives ([], []).
    '''
    # Sort the start positions by the suffix that begins at each one.
    indices = sorted(range(len(s)), key=lambda start: s[start:])
    suffixes = [s[start:] for start in indices]
    return suffixes, indices

# Demo: show the (sorted suffixes, start indices) pair for a small input.
print(suffixArray("cat$"))

def bwtViaSA(s):
    ''' Returns the BWT of s, built from the suffix array of s.

    s is used as given; no "$" terminator is appended here.
    '''
    length = len(s)
    # Only the start positions are needed; the suffix strings are ignored.
    _, starts = suffixArray(s)

    # For each sorted suffix, emit the character just before it in s.
    # The modulo wraps start 0 around to the last character of s.
    return "".join(s[(start - 1) % length] for start in starts)

# Demo: the first line is the computed transform; the second is a literal
# reference string printed underneath for comparison by eye.
print(bwtViaSA("mississippi$"))
print("ipssm$pissii")


(['$', 'at$', 'cat$', 't$'], [3, 1, 0, 2])
ipssm$pissii
ipssm$pissisi


#### Constructing the Run Length Encoding

In [1]:
def encode_bwt(bwt_string):
    ''' Run-length encodes bwt_string.

    Returns a list of (code, count) tuples, one per run of identical
    consecutive characters, where code is ord() of the run's character and
    count is the run length. Empty input gives an empty list.
    '''
    if not bwt_string:
        return []

    runs = []
    total = len(bwt_string)
    run_start = 0  # index where the current run begins

    # Walk one position past the end so the final run is flushed as well.
    for pos in range(1, total + 1):
        if pos == total or bwt_string[pos] != bwt_string[run_start]:
            runs.append((ord(bwt_string[run_start]), pos - run_start))
            run_start = pos

    return runs

# Example usage: run-length encode a short BWT string and print the
# resulting (character code, run length) pairs.
bwt_string = "nnbaa$aa"
encoded_bwt = encode_bwt(bwt_string)
print(encoded_bwt)


[(110, 2), (98, 1), (97, 2), (36, 1), (97, 2)]


### Inverting BWT Efficiently (no in memory storage)

In [6]:
def invert_bwt(bwt, char_count=None):
    ''' Lazily yields the characters of the original text, given its BWT.

    bwt        : the transformed string; it must contain the terminator "$".
    char_count : optional dict mapping each character of bwt to the number
                 of times it occurs in bwt. When omitted (None) the counts
                 are tallied from bwt.

    Yields len(bwt) single characters: the text in reading order, with "$"
    last. This is a generator, so no work is done -- and no error is raised,
    e.g. the KeyError when "$" has no entry in the counts -- until the
    result is iterated.
    '''
    # Step 1: Count the frequency of each character (skipped when supplied)
    if char_count is None:
        char_count = {}
        for char in bwt:
            char_count[char] = char_count.get(char, 0) + 1

    # Step 2: Cumulative counts give, for each character, the row where its
    # block starts in the sorted first column.
    total = 0
    first_occurrence = {}
    for char in sorted(char_count):
        first_occurrence[char] = total
        total += char_count[char]

    # Step 3: Map each first-column row to the last-column row that holds the
    # same character occurrence (k-th occurrence matches k-th occurrence).
    T = [0] * len(bwt)
    seen = dict.fromkeys(char_count, 0)
    for i, char in enumerate(bwt):
        T[first_occurrence[char] + seen[char]] = i
        seen[char] += 1

    # Step 4: Reconstruct the original string by following the map.
    # The first hop finds the row ending in "$"; the second hop lands on the
    # row whose last character is the first character of the text.
    row = T[T[first_occurrence['$']]]
    for _ in range(len(bwt)):
        yield bwt[row]
        row = T[row]

# print("".join(char for char in invert_bwt("lo$oogg")))  
# Demo: char_count gives how many times each character occurs in the BWT
# string passed alongside it; the yielded characters are joined and printed.
char_count = {'a': 3, 'n': 2, 'b': 1, '$': 1}
print("".join(char for char in invert_bwt("annb$aa",char_count)))


banana$


In [7]:
# Scratch check: print the number of characters in this bit string.
print(len("0001110001001100001101100010011110110111001010"))

46


In [8]:
# Scratch check: read the bit string as a base-2 integer and print the
# character with that code point.
print(chr(int("0100100", 2)))

$
