init

egonelbre · Dec 12, 2010 · 19aabc2 · 19aabc2
commit 19aabc2
Show file tree

Hide file tree

Showing 5 changed files with 383 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/README b/README
@@ -0,0 +1,23 @@
+=========
+FM-index
+=========
+
+This repository contains educational implementations
+of Burrows-Wheeler Tranformation and Ferragina-Manzini
+index.
+
+The implementations are fully commented and the main
+goal is to provide clear implementations of BWT and FM-index.
+
+Speed is considered secondary although there are
+different implementations with different algorithmic
+complexity.
+
+Related Papers
+--------------
+
+* Burrows M and Wheeler D (1994), "A block sorting lossless data compression algorithm"
+* Ferragina P and Manzini G (2000), "Opportunistic data structures with applications"
+* Ferragina P and Manzini G (2001), "An experimental study of an opportunistic index"
+* Ferragina P and Manzini G (2005), "Indexing compressed text"
+* Ferragina P and Manzini G (2001), "An experimental study of a compressed index"
diff --git a/bowtie.py b/bowtie.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/python
+import os, re
+import bwt
+
+class Bowtie:
+    def __init__(self, data):
+        self.orig = data
+        self.offset = {}
+        self.data = bwt.transform(data)
+        self.FM = None
+        self.init_FM()
+
+    def init_FM(self):
+        if self.FM != None: return
+        # left column
+        L = sorted(self.data)
+
+        # A is a index for letter -> L first occurance of letter
+        A = {}
+        last = ""
+        for i, c in enumerate(L):
+            if last != c:
+                A[c] = i
+                last = c
+        del last, L
+
+        # FM Index
+        FM = {}
+        for i, c in enumerate(self.data):
+            for x, v in A.items():
+                FM[(i,x)] = v
+            FM[i] = A[c]
+            A[c] += 1
+        i = len(self.data)
+        for x, v in A.items():
+            FM[(i,x)] = v
+        del A
+
+        self.FM = FM
+
+    def LF(self, idx, qc):
+        return self.FM[(idx,qc)]
+
+    def walk(self, idx):
+        r = 0
+        i = idx
+        while self.data[i] != "\0":
+            if self.offset.get(i):
+                r += self.offset[i]
+                break
+            r += 1
+            i = self.FM[i]
+
+        if not self.offset.get(idx):
+            self.offset[i] = r
+        return r    
+
+    def search(self, q):
+        top = 0
+        bot = len(self.data)
+        for i, qc in enumerate(q[::-1]):
+            top = self.LF(top,qc)
+            bot = self.LF(bot,qc)
+            if top == bot: return []
+        matches = []
+        for i in range(top, bot):
+            matches.append(self.walk(i))
+        return sorted(matches)
+
+b = Bowtie("ACGTCCGTAAAGCAGTCG")
diff --git a/bwt.py b/bwt.py
@@ -0,0 +1,245 @@
+# -*- coding: utf-8 -*-
+#!/usr/bin/python
+import os, re
+
+from suffixtree import SuffixTree
+
+class BurrowsWheeler():
+    EOS = "\0"
+    # EOS = "#" # a visible end marker
+
+    def Transform(self, s):
+        """ Simplest Burrows-Wheeler transform implementation, O(n^2) respective
+            to the length of the text. """
+        assert self.EOS not in s, "Input string cannot contain null character ('\0')"
+
+        # add end of text marker
+        s += self.EOS
+
+        # table of rotated input strings
+        rotations = [s[i:] + s[:i] for i in range(len(s))]
+
+        # sort the table of rotations
+        table = sorted(rotations)
+
+        # extract the last characters of each row
+        last_column = [row[-1] for row in table]
+
+        # convert the characters to a string
+        r = "".join(last_column)
+
+        return r
+
+    def Inverse(self, s):
+        """ Simplest Inverse Burrow-Wheeler transform implementation. """
+        # make empty table for the suffix array
+        table = [""] * len(s)
+
+        # use LF-mapping to reverse the tranformation
+        for i in range(len(s)):
+            # add one letter for each partial string in the suffix array
+            prepended = [s[i] + table[i] for i in range(len(s))]
+
+            # convert it to sorted suffix array
+            table = sorted(prepended)
+
+        # Find the correct row (ending in "\0")
+        for row in table:
+            if row.endswith(self.EOS):
+                s = row
+                break
+
+        # Get rid of trailing null character
+        s = s.rstrip(self.EOS)
+
+        return s
+
+# ---------------------------------------------------------------------------- #
+# Different Transform implementations
+# ---------------------------------------------------------------------------- #
+
+class SuffixTreeBurrowsWheeler(BurrowsWheeler):
+
+    def _walk(self, node, len = 0):
+        """ returns the length of suffixes ordered alphabetically """
+        t = []
+        for c, n in sorted(node.items()):
+            if c == 0:
+                t.append(len)
+                continue
+            k = self._walk(n, len + 1)
+            t.extend(k)
+        return t
+
+    def Transform(self, s):
+        """ Burrows-Wheeler transform with SuffixTree """
+        assert "\0" not in s, "Input string cannot contain null character ('\0')"
+
+        # add end of text marker
+        s += self.EOS
+
+        st = SuffixTree()
+
+        # construct a suffix tree O(n * log n)
+        # can also be done in O(n) time
+        st.add(s)
+
+        # walk inorder to find sorted suffixes
+        # only get the length of each suffix
+        lens = self._walk(st.root)
+
+        # as the last column letter will be left of the suffix
+        # this means it's len(suffix) + 1
+        # from the end of the input string s
+
+        r = [0]*len(lens)
+        for i in xrange(len(lens)):
+            l = lens[i]
+            if l == len(lens):
+                r[i] = self.EOS
+            else:
+                r[i] = s[-l-1:-l]
+        return ''.join(r)
+
+# ---------------------------------------------------------------------------- #
+# Different Inverse implementations
+# ---------------------------------------------------------------------------- #
+
+class FastBurrowsWheeler(BurrowsWheeler):
+
+    def Inverse(self, s):
+        """ Inverse Burrow-Wheeler transform based on
+            "A block sorting lossless data compression algorithm"
+            uses LF-mapping for rebuilding the original text.
+            O(n) time, O(n*E) memory """
+
+        # count the letters in input string for calculating the
+        # first occurance of the letter in left column of the sorted
+        # suffix matrix
+        A = {} # letter count
+        for i, c in enumerate(s):
+            if A.get(c):
+                A[c] += 1
+            else:
+                A[c] = 1
+
+        # sort the letters
+        letters = sorted(A.keys())
+
+        occ = {} # first index of letter
+
+        idx = 0
+        for c in letters:
+            occ[c] = idx
+            idx += A[c]
+        del idx, A
+
+        # calculate the LF-mapping
+        # LF is mapping from input letter rank occurance to left letter
+        # this shows for which idx in last column corresponds to the first idx
+        LF = [0] * len(s)
+        for i, c in enumerate(s):
+            LF[i] = occ[c]
+            occ[c] += 1
+        del occ
+
+        # create an empty list for storing the string
+        r = [0]*(len(s)-1)
+        i = 0
+
+        # here we follow the LF mapping until we have the full string
+        for k in xrange(len(r)-1,-1,-1):
+            r[k] = s[i]
+            i = LF[i]
+
+        # convert it to a string
+        r = ''.join(r)
+        return r.rstrip(self.EOS)
+
+
+class CheckpointingBurrowsWheeler(BurrowsWheeler):
+    # how often checkpoints are made
+    def __init__(self, step = 20):
+        self.step = max(1, step)
+
+    def LF(self, s, idx, C, occ):
+        # s - is the transformed text
+        # idx - is the index in the tranformed string
+        # C - is the checkpoint list with step 20
+        # occ - is the first occurance of the letters
+
+        # find the nearest checkpoint for idx
+        check = (idx + (self.step / 2)) / self.step
+        if check >= len(C):
+            check = len(C) - 1
+        pos = check * self.step
+
+        letter = s[idx]
+
+        # count of the letter s[idx] upto pos (not included)
+        count = C[check].get(letter)
+        if count == None: count = 0
+
+        # range between pos and idx
+        if pos < idx:
+            r = xrange(pos, idx)
+        else:
+            r = xrange(idx, pos)
+
+        # count of letters between pos, idx
+        k = 0        
+        for i in r:
+            if letter == s[i]:
+                k += 1
+
+        # calculate the letter count upto idx (not included)
+        if pos < idx:
+            count += k
+        else:
+            count -= k
+
+        # return the appropriate idx
+        return occ[letter] + count
+
+    def Inverse(self, s):
+        """ O(n * (step / 4) + n) time, O(n / step + step * E) memory,
+            where E is the letter count """
+
+        # count the letters in input string for calculating the
+        # first occurance of the letter in left column of the sorted
+        # suffix matrix
+        A = {} # letter count
+        C = [] # checkpoints
+        for i, c in enumerate(s):
+            if i % self.step == 0:
+                C.append(A.copy())
+            if A.get(c):
+                A[c] += 1
+            else:
+                A[c] = 1
+
+        # sort the letters
+        letters = sorted(A.keys())
+
+        # first index of letter in left column
+        occ = {}
+
+        idx = 0
+        for c in letters:
+            occ[c] = idx
+            idx += A[c]
+
+        del A
+
+        # create an empty list for storing the string
+        r = [0]*(len(s)-1)
+        i = 0
+
+        # here we follow the LF mapping until we have the full string
+        for k in xrange(len(r)-1,-1,-1):
+            r[k] = s[i]
+            i = self.LF(s, i, C, occ)
+
+        # convert it to a string
+        r = ''.join(r)
+        return r.rstrip(self.EOS)
diff --git a/suffixtree.py b/suffixtree.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import pprint as pp
+import re
+
+class SuffixTree(object):
+    def __init__(self,*args):
+        self.root = {}
+
+    def _add(self, node, s):
+        if len(s) <= 0:
+            node[0] = ''
+            return
+        c = s[0]
+        if node.has_key(c):
+            self._add(node[c], s[1:])
+        else:
+            node[c] = {}
+            self._add(node[c], s[1:])
+
+    def add(self, s):
+        for i in xrange(len(s)):
+            self._add(self.root, s[i:])
+
+    def __repr__(self):
+        return str(self.root)
+
+    def _strings(self, node, prefix):
+        t = []
+        for c,n in sorted(node.items()):
+            if c == 0:
+                t.append(prefix)
+                continue
+            k = self._strings(n, prefix + c)
+            t.extend(k)
+        return t
+
+    def strings(self):
+        return self._strings(self.root,'')
+
+    def __str__(self):
+        return '\n'.join(self.strings())