In [None]:
import numpy as np

# --- Suffix Tree Implementation ---
class SuffixTreeNode:
    """A single node of the suffix tree."""

    def __init__(self):
        # Outgoing edges keyed by their first character.
        # Each value is a tuple (start, length, child node), where the edge
        # label is text[start:start + length].
        self.children = {}
        # Start position of the suffix that ends at this node, or -1 if no
        # suffix ends here.
        self.suffix_index = -1

class SuffixTree:
    """Suffix tree built naively by inserting every suffix of ``text`` in turn.

    Construction walks from the root for each suffix, so it costs O(n^2)
    character comparisons in the worst case.

    The text does not need a unique terminator: a suffix that is a prefix of an
    already inserted suffix is recorded on the (possibly newly split) internal
    node where it ends, instead of raising ``IndexError`` or being dropped.
    """

    def __init__(self, text):
        """Store ``text`` and build the tree immediately."""
        self.text = text
        self.root = SuffixTreeNode()
        self.build_suffix_tree()

    def build_suffix_tree(self):
        """Insert the suffixes text[i:] for i = 0 .. len(text) - 1 into the tree."""
        n = len(self.text)
        for i in range(n):  # insert all suffixes
            current = self.root
            j = i  # next unmatched position of suffix i
            while j < n:
                c = self.text[j]
                if c not in current.children:
                    # No edge starts with c: hang a new leaf covering text[j:].
                    child = SuffixTreeNode()
                    child.suffix_index = i
                    current.children[c] = (j, n - j, child)
                    break

                start, length, child = current.children[c]
                # Count how many characters of the edge label match the suffix.
                k = 0
                while k < length and j + k < n and self.text[start + k] == self.text[j + k]:
                    k += 1

                if k == length:
                    # Whole edge matched: descend and keep matching.
                    current = child
                    j += k
                    continue

                # Mismatch (or end of suffix) inside the edge: split it after k characters.
                split = SuffixTreeNode()
                current.children[c] = (start, k, split)
                split.children[self.text[start + k]] = (start + k, length - k, child)
                if j + k == n:
                    # The suffix is exhausted exactly at the split point, so it
                    # ends on the new internal node and there is no next
                    # character to branch on.
                    split.suffix_index = i
                else:
                    new_leaf = SuffixTreeNode()
                    new_leaf.suffix_index = i
                    split.children[self.text[j + k]] = (j + k, n - (j + k), new_leaf)
                break
            else:
                # The loop ended without a break: the suffix was consumed
                # exactly at an existing internal node, so record it there.
                current.suffix_index = i

# --- Suffix Array from Suffix Tree ---
def get_suffix_array_from_tree(suffix_tree):
    """Return the suffix indices stored in the tree in lexicographic order.

    Performs a pre-order traversal from ``suffix_tree.root``, visiting the
    children of every node in sorted order of their edge's first character.
    A node's own ``suffix_index`` (when not -1) is emitted before anything
    below it.

    The traversal uses an explicit stack rather than recursion, so trees whose
    depth exceeds Python's recursion limit do not raise ``RecursionError``.

    Args:
        suffix_tree: object with a ``root`` node; every node exposes
            ``children`` (dict: char -> (start, length, child)) and
            ``suffix_index``.

    Returns:
        list of suffix start positions.
    """
    result = []
    stack = [suffix_tree.root]
    while stack:
        node = stack.pop()
        if node.suffix_index != -1:
            result.append(node.suffix_index)
        # Push in reverse order so the smallest key is popped (visited) first.
        for key in sorted(node.children, reverse=True):
            stack.append(node.children[key][2])
    return result

# --- Burrows-Wheeler Transform (BWT) ---
def compute_bwt(text, suffix_array):
    """Build the Burrows-Wheeler transform of ``text + "$"``.

    Args:
        text: the text without its terminator; "$" is appended here.
        suffix_array: suffix start positions of the terminated text, in
            sorted-suffix order.

    Returns:
        A string whose i-th character is the one preceding suffix
        ``suffix_array[i]`` in the terminated text ("$" for suffix 0).
    """
    terminated = text + "$"
    return "".join(
        "$" if pos == 0 else terminated[pos - 1]
        for pos in suffix_array
    )

# --- C Table: First column of sorted BWT matrix ---
def compute_c_table(bwt):
    """Map each character of ``bwt`` to its first index in the sorted BWT.

    That index equals the number of characters in ``bwt`` that are strictly
    smaller than the key. Keys are inserted in sorted character order.
    """
    first_position = {}
    for rank, symbol in enumerate(sorted(bwt)):
        # Only the first (lowest) rank of each symbol is kept.
        first_position.setdefault(symbol, rank)
    return first_position

# --- OCC Table: Cumulative counts of characters at each position ---
def compute_occ_table(bwt):
    """Build the cumulative occurrence table of ``bwt``.

    Returns:
        (occ_table, char_to_index) where ``char_to_index`` maps each distinct
        character (in sorted order) to a row number, and
        ``occ_table[row, i]`` is the number of times that character appears in
        ``bwt[:i]``. The table has ``len(bwt) + 1`` columns; column 0 is all
        zeros.
    """
    alphabet = sorted(set(bwt))
    char_to_index = {symbol: row for row, symbol in enumerate(alphabet)}

    # Mark a 1 in column i for the character at bwt[i - 1]; the running sum
    # along each row then yields the cumulative counts.
    indicator = np.zeros((len(alphabet), len(bwt) + 1), dtype=int)
    for column, symbol in enumerate(bwt, start=1):
        indicator[char_to_index[symbol], column] = 1
    occ_table = np.cumsum(indicator, axis=1, dtype=int)

    return occ_table, char_to_index

# --- FM-index Backward Search ---
def backward_search(pattern, c_table, occ_table, char_to_index, bwt):
    """FM-index backward search for ``pattern``.

    Args:
        pattern: string to look for.
        c_table: char -> first index of that char in the sorted BWT.
        occ_table: 2-D cumulative count table indexed [row, prefix length].
        char_to_index: char -> row of ``occ_table``.
        bwt: the BWT string; only its length is used here.

    Returns:
        The list of suffix-array indices (0-based) whose suffixes start with
        ``pattern``; empty when there is no match. An empty pattern matches
        every index.
    """
    # Half-open 0-based interval [lo, hi) of matching suffix-array rows.
    lo, hi = 0, len(bwt)
    for symbol in reversed(pattern):
        row = char_to_index.get(symbol)
        if row is None:
            return []
        base = c_table[symbol]
        lo = base + occ_table[row, lo]
        hi = base + occ_table[row, hi]
        if lo >= hi:
            return []
    return list(range(lo, hi))

# --- Main Program ---
if __name__ == "__main__":
    reference_genome = "abracadabracadkff"

    # Step 1: Build the suffix tree over the "$"-terminated text, then derive
    # the suffix array, BWT, C table and occurrence table from it.
    tree = SuffixTree(reference_genome + "$")
    suffix_array = get_suffix_array_from_tree(tree)
    bwt_string = compute_bwt(reference_genome, suffix_array)
    c_table = compute_c_table(bwt_string)
    occ_table, char_to_index = compute_occ_table(bwt_string)

    # Step 2: Print FM-index details
    print("Suffix Array:", suffix_array)
    print("BWT String:", bwt_string)
    print("C Table:", c_table)
    print("Occurrence Table:")
    for char, idx in char_to_index.items():
        print(f"{char}: {occ_table[idx]}")

    # Step 3: Seed-based search and reconstruction.
    # Every seed found in the reference is copied into a skeleton of the
    # reference's length; positions no seed covers stay '-'.
    seeds = ["dda", "cad", "brd", "rac", "add"]
    reconstruction = ['-' for _ in range(len(reference_genome))]

    print("\n Performing seed-based search and reconstruction:")
    for seed in seeds:
        print(f"\n Seed: '{seed}'")
        sa_indices = backward_search(seed, c_table, occ_table, char_to_index, bwt_string)
        positions = [suffix_array[i] for i in sa_indices]
        print(f" Found at positions: {positions}")

        for pos in positions:
            # Use the seed's own length rather than a fixed constant so seeds
            # of any length are written in full and never past the end.
            if pos + len(seed) <= len(reference_genome):
                reconstruction[pos:pos + len(seed)] = list(seed)

    reconstructed_query = ''.join(reconstruction)
    print("\n Reconstructed Read Skeleton from Seeds:")
    print(reconstructed_query)


Suffix Array: [17, 0, 7, 3, 10, 5, 12, 1, 8, 4, 11, 6, 13, 16, 15, 14, 2, 9]
BWT String: f$drrccaaaaaafkdbb
C Table: {'$': 0, 'a': 1, 'b': 7, 'c': 9, 'd': 11, 'f': 13, 'k': 15, 'r': 16}
Occurrence Table:
$: [0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
a: [0 0 0 0 0 0 0 0 1 2 3 4 5 6 6 6 6 6 6]
b: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2]
c: [0 0 0 0 0 0 1 2 2 2 2 2 2 2 2 2 2 2 2]
d: [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2]
f: [0 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2]
k: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1]
r: [0 0 0 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2]

 Performing seed-based search and reconstruction:

 Seed: 'dda'
 Found at positions: []

 Seed: 'cad'
 Found at positions: [4, 11]

 Seed: 'brd'
 Found at positions: []

 Seed: 'rac'
 Found at positions: [2, 9]

 Seed: 'add'
 Found at positions: []

 Reconstructed Read Skeleton from Seeds:
--racad--racad---


In [None]:
import numpy as np
from collections import Counter

# --- Suffix Tree Implementation ---
class SuffixTreeNode:
    """A single node of the suffix tree."""

    def __init__(self):
        # Outgoing edges keyed by their first character.
        # Each value is a tuple (start, length, child node), where the edge
        # label is text[start:start + length].
        self.children = {}
        # Start position of the suffix that ends at this node, or -1 if no
        # suffix ends here.
        self.suffix_index = -1

class SuffixTree:
    """Suffix tree built naively by inserting every suffix of ``text`` in turn.

    Construction walks from the root for each suffix, so it costs O(n^2)
    character comparisons in the worst case.

    The text does not need a unique terminator: a suffix that is a prefix of an
    already inserted suffix is recorded on the (possibly newly split) internal
    node where it ends, instead of raising ``IndexError`` or being dropped.
    """

    def __init__(self, text):
        """Store ``text`` and build the tree immediately."""
        self.text = text
        self.root = SuffixTreeNode()
        self.build_suffix_tree()

    def build_suffix_tree(self):
        """Insert the suffixes text[i:] for i = 0 .. len(text) - 1 into the tree."""
        n = len(self.text)
        for i in range(n):  # insert all suffixes
            current = self.root
            j = i  # next unmatched position of suffix i
            while j < n:
                c = self.text[j]
                if c not in current.children:
                    # No edge starts with c: hang a new leaf covering text[j:].
                    child = SuffixTreeNode()
                    child.suffix_index = i
                    current.children[c] = (j, n - j, child)
                    break

                start, length, child = current.children[c]
                # Count how many characters of the edge label match the suffix.
                k = 0
                while k < length and j + k < n and self.text[start + k] == self.text[j + k]:
                    k += 1

                if k == length:
                    # Whole edge matched: descend and keep matching.
                    current = child
                    j += k
                    continue

                # Mismatch (or end of suffix) inside the edge: split it after k characters.
                split = SuffixTreeNode()
                current.children[c] = (start, k, split)
                split.children[self.text[start + k]] = (start + k, length - k, child)
                if j + k == n:
                    # The suffix is exhausted exactly at the split point, so it
                    # ends on the new internal node and there is no next
                    # character to branch on.
                    split.suffix_index = i
                else:
                    new_leaf = SuffixTreeNode()
                    new_leaf.suffix_index = i
                    split.children[self.text[j + k]] = (j + k, n - (j + k), new_leaf)
                break
            else:
                # The loop ended without a break: the suffix was consumed
                # exactly at an existing internal node, so record it there.
                current.suffix_index = i

#  Suffix Array from Suffix Tree
def get_suffix_array_from_tree(suffix_tree):
    """Return the suffix indices stored in the tree in lexicographic order.

    Performs a pre-order traversal from ``suffix_tree.root``, visiting the
    children of every node in sorted order of their edge's first character.
    A node's own ``suffix_index`` (when not -1) is emitted before anything
    below it.

    The traversal uses an explicit stack rather than recursion, so trees whose
    depth exceeds Python's recursion limit do not raise ``RecursionError``.

    Args:
        suffix_tree: object with a ``root`` node; every node exposes
            ``children`` (dict: char -> (start, length, child)) and
            ``suffix_index``.

    Returns:
        list of suffix start positions.
    """
    result = []
    stack = [suffix_tree.root]
    while stack:
        node = stack.pop()
        if node.suffix_index != -1:
            result.append(node.suffix_index)
        # Push in reverse order so the smallest key is popped (visited) first.
        for key in sorted(node.children, reverse=True):
            stack.append(node.children[key][2])
    return result

# --- Burrows-Wheeler Transform (BWT) ---
def compute_bwt(text, suffix_array):
    """Build the Burrows-Wheeler transform of ``text + "$"``.

    Args:
        text: the text without its terminator; "$" is appended here.
        suffix_array: suffix start positions of the terminated text, in
            sorted-suffix order.

    Returns:
        A string whose i-th character is the one preceding suffix
        ``suffix_array[i]`` in the terminated text ("$" for suffix 0).
    """
    terminated = text + "$"
    preceding = []
    for pos in suffix_array:
        preceding.append("$" if pos == 0 else terminated[pos - 1])
    return "".join(preceding)

# --- C Table: First column of sorted BWT matrix ---
def compute_c_table(bwt):
    """Map each character of ``bwt`` to its first index in the sorted BWT.

    That index equals the number of characters in ``bwt`` that are strictly
    smaller than the key. Keys are inserted in sorted character order.
    """
    first_position = {}
    for rank, symbol in enumerate(sorted(bwt)):
        # Only the first (lowest) rank of each symbol is kept.
        if symbol not in first_position:
            first_position[symbol] = rank
    return first_position

# --- OCC Table: Cumulative counts of characters at each position ---
def compute_occ_table(bwt):
    """Build the cumulative occurrence table of ``bwt``.

    Returns:
        (occ_table, char_to_index) where ``char_to_index`` maps each distinct
        character (in sorted order) to a row number, and
        ``occ_table[row, i]`` is the number of times that character appears in
        ``bwt[:i]``. The table has ``len(bwt) + 1`` columns; column 0 is all
        zeros.
    """
    alphabet = sorted(set(bwt))
    char_to_index = {symbol: row for row, symbol in enumerate(alphabet)}

    # Mark a 1 in column i for the character at bwt[i - 1]; the running sum
    # along each row then yields the cumulative counts.
    indicator = np.zeros((len(alphabet), len(bwt) + 1), dtype=int)
    for column, symbol in enumerate(bwt, start=1):
        indicator[char_to_index[symbol], column] = 1
    occ_table = np.cumsum(indicator, axis=1, dtype=int)

    return occ_table, char_to_index

# --- FM-index Backward Search ---
def backward_search(pattern, c_table, occ_table, char_to_index, bwt):
    """FM-index backward search for ``pattern``.

    Args:
        pattern: string to look for.
        c_table: char -> first index of that char in the sorted BWT.
        occ_table: 2-D cumulative count table indexed [row, prefix length].
        char_to_index: char -> row of ``occ_table``.
        bwt: the BWT string; only its length is used here.

    Returns:
        The list of suffix-array indices (0-based) whose suffixes start with
        ``pattern``; empty when there is no match. An empty pattern matches
        every index.
    """
    # Half-open 0-based interval [lo, hi) of matching suffix-array rows.
    lo, hi = 0, len(bwt)
    for symbol in reversed(pattern):
        row = char_to_index.get(symbol)
        if row is None:
            return []
        base = c_table[symbol]
        lo = base + occ_table[row, lo]
        hi = base + occ_table[row, hi]
        if lo >= hi:
            return []
    return list(range(lo, hi))

#  Main Program
if __name__ == "__main__":
    reference_genome = "ACGTACGTGCTAGCTAGCTAGCTA"

    # Step 1: Build the suffix tree over the "$"-terminated text, then derive
    # the suffix array, BWT, C table and occurrence table from it.
    tree = SuffixTree(reference_genome + "$")
    suffix_array = get_suffix_array_from_tree(tree)
    bwt_string = compute_bwt(reference_genome, suffix_array)
    c_table = compute_c_table(bwt_string)
    occ_table, char_to_index = compute_occ_table(bwt_string)

    # Step 2: Print FM-index details
    print("Suffix Array:", suffix_array)
    print("BWT String:", bwt_string)
    print("C Table:", c_table)
    print("Occurrence Table:")
    for char, idx in char_to_index.items():
        print(f"{char}: {occ_table[idx]}")

    # Step 3: Seed-based search and reconstruction.
    # The comments give the 0-based positions at which each seed occurs in
    # reference_genome. A seed listed several times may be placed that many
    # times.
    seeds = [
        "ACGTAC",  # occurs at position 0
        "TACGTG",  # occurs at position 3
        "TGCTAG",  # occurs at position 7
        "AGCTAG",  # occurs at positions 11 and 15
        "AGCTAG",  # repeat of the seed above
        "TAGCTA",  # occurs at positions 10, 14 and 18
        "AGCTAG",  # repeat of the seed above
    ]
    # Skeleton of the reference's length; positions no seed covers stay '-'.
    reconstruction = ['-' for _ in range(len(reference_genome))]

    # How many times each distinct seed was supplied: this caps how many of
    # its matches are written into the skeleton.
    seed_counts = Counter(seeds)

    print("\nPerforming seed-based search and reconstruction:")

    # Iterate over each distinct seed (in first-seen order)
    for seed in seed_counts:
        print(f"\nSeed: '{seed}'")
        sa_indices = backward_search(seed, c_table, occ_table, char_to_index, bwt_string)
        positions = [suffix_array[i] for i in sa_indices]  # suffix-array rows -> text positions
        print(f"Found at positions: {positions}")

        used = 0
        max_uses = seed_counts[seed]

        for pos in positions:
            if used >= max_uses:
                break  # never place a seed more often than it was supplied

            # The seed fits only if its last character is still inside the reference.
            can_place = pos + len(seed) <= len(reference_genome)

            if can_place:
                # Write the seed into the skeleton (overlaps simply overwrite)
                reconstruction[pos:pos + len(seed)] = list(seed)
                used += 1  # count only successful placements
            else:
                print(f"Warning: Couldn't place seed '{seed}' at position {pos} due to out-of-bounds.")

    # Join the skeleton into the final reconstructed sequence
    reconstructed_query = ''.join(reconstruction)

    # Print the reconstructed sequence
    print("\nReconstructed Read Skeleton from Seeds:")
    print(reconstructed_query)




Suffix Array: [24, 23, 0, 4, 19, 15, 11, 1, 5, 21, 17, 13, 9, 20, 16, 12, 8, 2, 6, 22, 3, 18, 14, 10, 7]
BWT String: AT$TTTTAAGGGGAAATCCCGCCCG
C Table: {'$': 0, 'A': 1, 'C': 7, 'G': 13, 'T': 19}
Occurrence Table:
$: [0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
A: [0 1 1 1 1 1 1 1 2 3 3 3 3 3 4 5 6 6 6 6 6 6 6 6 6 6]
C: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 3 4 5 6 6]
G: [0 0 0 0 0 0 0 0 0 0 1 2 3 4 4 4 4 4 4 4 4 5 5 5 5 6]
T: [0 0 1 1 2 3 4 5 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 6 6 6]

Performing seed-based search and reconstruction:

Seed: 'ACGTAC'
Found at positions: [0]

Seed: 'TACGTG'
Found at positions: [3]

Seed: 'TGCTAG'
Found at positions: [7]

Seed: 'AGCTAG'
Found at positions: [15, 11]

Seed: 'TAGCTA'
Found at positions: [18, 14, 10]

Reconstructed Read Skeleton from Seeds:
ACGTACGTGCTAGCTAGCTAGCTA


In [None]:
import numpy as np
from collections import Counter

# --- Suffix Tree Implementation ---
class SuffixTreeNode:
    """A single node of the suffix tree."""

    def __init__(self):
        # Outgoing edges keyed by their first character.
        # Each value is a tuple (start, length, child node), where the edge
        # label is text[start:start + length].
        self.children = {}
        # Start position of the suffix that ends at this node, or -1 if no
        # suffix ends here.
        self.suffix_index = -1

class SuffixTree:
    """Suffix tree built naively by inserting every suffix of ``text`` in turn.

    ``operation_count`` tallies the work done during construction: one unit
    per inserted suffix, one per edge-walk step, and one per matched character.

    The text does not need a unique terminator: a suffix that is a prefix of an
    already inserted suffix is recorded on the (possibly newly split) internal
    node where it ends, instead of raising ``IndexError`` or being dropped.
    """

    def __init__(self, text):
        """Store ``text``, reset the operation counter and build the tree."""
        self.text = text
        self.root = SuffixTreeNode()
        self.operation_count = 0
        self.build_suffix_tree()

    def build_suffix_tree(self):
        """Insert the suffixes text[i:] for i = 0 .. len(text) - 1 into the tree."""
        n = len(self.text)
        for i in range(n):
            self.operation_count += 1  # one unit per inserted suffix
            current = self.root
            j = i  # next unmatched position of suffix i
            while j < n:
                self.operation_count += 1  # one unit per edge-walk step
                c = self.text[j]
                if c not in current.children:
                    # No edge starts with c: hang a new leaf covering text[j:].
                    child = SuffixTreeNode()
                    child.suffix_index = i
                    current.children[c] = (j, n - j, child)
                    break

                start, length, child = current.children[c]
                # Count how many characters of the edge label match the suffix.
                k = 0
                while k < length and j + k < n and self.text[start + k] == self.text[j + k]:
                    k += 1
                    self.operation_count += 1  # one unit per matched character

                if k == length:
                    # Whole edge matched: descend and keep matching.
                    current = child
                    j += k
                    continue

                # Mismatch (or end of suffix) inside the edge: split it after k characters.
                split = SuffixTreeNode()
                current.children[c] = (start, k, split)
                split.children[self.text[start + k]] = (start + k, length - k, child)
                if j + k == n:
                    # The suffix is exhausted exactly at the split point, so it
                    # ends on the new internal node and there is no next
                    # character to branch on.
                    split.suffix_index = i
                else:
                    new_leaf = SuffixTreeNode()
                    new_leaf.suffix_index = i
                    split.children[self.text[j + k]] = (j + k, n - (j + k), new_leaf)
                break
            else:
                # The loop ended without a break: the suffix was consumed
                # exactly at an existing internal node, so record it there.
                current.suffix_index = i

# Suffix Array from Suffix Tree
def get_suffix_array_from_tree(suffix_tree):
    """Return the tree's suffix indices in lexicographic order, with a work count.

    Performs a pre-order traversal from ``suffix_tree.root``, visiting the
    children of every node in sorted order of their edge's first character.
    A node's own ``suffix_index`` (when not -1) is emitted before anything
    below it.

    The traversal uses an explicit stack rather than recursion, so trees whose
    depth exceeds Python's recursion limit do not raise ``RecursionError``.

    Args:
        suffix_tree: object with a ``root`` node; every node exposes
            ``children`` (dict: char -> (start, length, child)) and
            ``suffix_index``.

    Returns:
        (result, operation_count): the list of suffix start positions, and a
        count of one unit per emitted suffix index plus one per child edge
        followed.
    """
    result = []
    operation_count = 0
    stack = [suffix_tree.root]
    while stack:
        node = stack.pop()
        if node.suffix_index != -1:
            result.append(node.suffix_index)
            operation_count += 1
        # Push in reverse order so the smallest key is popped (visited) first.
        for key in sorted(node.children, reverse=True):
            operation_count += 1
            stack.append(node.children[key][2])
    return result, operation_count

# --- Burrows-Wheeler Transform (BWT) ---
def compute_bwt(text, suffix_array):
    """Build the Burrows-Wheeler transform of ``text + "$"`` and count the work.

    Args:
        text: the text without its terminator; "$" is appended here.
        suffix_array: suffix start positions of the terminated text, in
            sorted-suffix order.

    Returns:
        (bwt, operation_count): ``bwt[i]`` is the character preceding suffix
        ``suffix_array[i]`` in the terminated text ("$" for suffix 0);
        ``operation_count`` is the number of suffix-array entries processed.
    """
    terminated = text + "$"
    preceding = ["$" if pos == 0 else terminated[pos - 1] for pos in suffix_array]
    return "".join(preceding), len(preceding)

# --- C Table: First column of sorted BWT matrix ---
def compute_c_table(bwt):
    """Map each character of ``bwt`` to its first index in the sorted BWT.

    Returns:
        (c_table, operation_count): ``c_table[ch]`` is the index of the first
        ``ch`` in ``sorted(bwt)``, with keys inserted in sorted character
        order; ``operation_count`` is the number of distinct characters.
    """
    first_position = {}
    for rank, symbol in enumerate(sorted(bwt)):
        # Only the first (lowest) rank of each symbol is kept.
        first_position.setdefault(symbol, rank)
    return first_position, len(first_position)

# --- OCC Table: Cumulative counts of characters at each position ---
def compute_occ_table(bwt):
    """Build the cumulative occurrence table of ``bwt`` and count the work.

    Returns:
        (occ_table, char_to_index, operation_count) where ``char_to_index``
        maps each distinct character (in sorted order) to a row number,
        ``occ_table[row, i]`` is the number of times that character appears in
        ``bwt[:i]`` (``len(bwt) + 1`` columns, column 0 all zeros), and
        ``operation_count`` is one unit per BWT position.
    """
    alphabet = sorted(set(bwt))
    char_to_index = {symbol: row for row, symbol in enumerate(alphabet)}

    # Mark a 1 in column i for the character at bwt[i - 1]; the running sum
    # along each row then yields the cumulative counts.
    indicator = np.zeros((len(alphabet), len(bwt) + 1), dtype=int)
    for column, symbol in enumerate(bwt, start=1):
        indicator[char_to_index[symbol], column] = 1
    occ_table = np.cumsum(indicator, axis=1, dtype=int)

    return occ_table, char_to_index, len(bwt)

# --- FM-index Backward Search ---
def backward_search(pattern, c_table, occ_table, char_to_index, bwt):
    """FM-index backward search for ``pattern``, with a work count.

    Args:
        pattern: string to look for.
        c_table: char -> first index of that char in the sorted BWT.
        occ_table: 2-D cumulative count table indexed [row, prefix length].
        char_to_index: char -> row of ``occ_table``.
        bwt: the BWT string; only its length is used here.

    Returns:
        (indices, operation_count): the 0-based suffix-array indices whose
        suffixes start with ``pattern`` (empty when there is no match; every
        index for an empty pattern), and the number of pattern characters
        examined before the search finished.
    """
    # Half-open 0-based interval [lo, hi) of matching suffix-array rows.
    lo, hi = 0, len(bwt)
    steps = 0
    for symbol in reversed(pattern):
        steps += 1
        row = char_to_index.get(symbol)
        if row is None:
            return [], steps
        base = c_table[symbol]
        lo = base + occ_table[row, lo]
        hi = base + occ_table[row, hi]
        if lo >= hi:
            return [], steps
    return list(range(lo, hi)), steps

# Main Program
if __name__ == "__main__":
    reference_genome = "ACGTACGTGCTAGCTAGCTAGCTA"
    # Operation counts per pipeline stage, printed in the summary at the end.
    total_ops = {}

    # Step 1: Build Suffix Tree over the "$"-terminated text
    tree = SuffixTree(reference_genome + "$")
    total_ops["Suffix Tree Construction"] = tree.operation_count

    # Step 2: Suffix Array
    suffix_array, sa_ops = get_suffix_array_from_tree(tree)
    total_ops["Suffix Array"] = sa_ops

    # Step 3: BWT
    bwt_string, bwt_ops = compute_bwt(reference_genome, suffix_array)
    total_ops["BWT Computation"] = bwt_ops

    # Step 4: C Table
    c_table, c_ops = compute_c_table(bwt_string)
    total_ops["C Table"] = c_ops

    # Step 5: Occurrence Table
    occ_table, char_to_index, occ_ops = compute_occ_table(bwt_string)
    total_ops["Occurrence Table"] = occ_ops

    # Step 6: Reconstruction with seeds.
    # The comments give the 0-based positions at which each seed occurs in
    # reference_genome. A seed listed several times may be placed that many
    # times.
    seeds = [
        "ACGTAC",  # occurs at position 0
        "TACGTG",  # occurs at position 3
        "TGCTAG",  # occurs at position 7
        "AGCTAG",  # occurs at positions 11 and 15
        "AGCTAG",  # repeat of the seed above
        "TAGCTA",  # occurs at positions 10, 14 and 18
        "AGCTAG",  # repeat of the seed above
    ]
    # Skeleton of the reference's length; positions no seed covers stay '-'.
    reconstruction = ['-' for _ in range(len(reference_genome))]
    seed_counts = Counter(seeds)
    reconstruction_ops = 0

    print("\nPerforming seed-based search and reconstruction:")
    for seed in seed_counts:
        print(f"\nSeed: '{seed}'")
        sa_indices, bs_ops = backward_search(seed, c_table, occ_table, char_to_index, bwt_string)
        reconstruction_ops += bs_ops
        positions = [suffix_array[i] for i in sa_indices]
        print(f"Found at positions: {positions}")

        used = 0
        max_uses = seed_counts[seed]
        for pos in positions:
            if used >= max_uses:
                break  # never place a seed more often than it was supplied
            # Slice by the seed's own length. Assigning a longer list to a
            # shorter slice would insert extra elements and grow the skeleton
            # beyond the reference length.
            end = pos + len(seed)
            if end <= len(reference_genome):
                reconstruction[pos:end] = list(seed)
                used += 1
                reconstruction_ops += 1

    total_ops["Backward Search + Reconstruction"] = reconstruction_ops

    reconstructed_query = ''.join(reconstruction)
    print("\nReconstructed Read Skeleton from Seeds:")
    print(reconstructed_query)

    print("\nOperation Count Summary:")
    for key, val in total_ops.items():
        print(f"{key}: {val}")



Performing seed-based search and reconstruction:

Seed: 'ACGTAC'
Found at positions: [0]

Seed: 'TACGTG'
Found at positions: [3]

Seed: 'TGCTAG'
Found at positions: [7]

Seed: 'AGCTAG'
Found at positions: [15, 11]

Seed: 'TAGCTA'
Found at positions: [18, 14, 10]

Reconstructed Read Skeleton from Seeds:
ACGTACGTGCTAGCTAG-TAGCTATAG---------------

Operation Count Summary:
Suffix Tree Construction: 159
Suffix Array: 68
BWT Computation: 25
C Table: 5
Occurrence Table: 25
Backward Search + Reconstruction: 36
