Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
egonelbre committed Dec 12, 2010
0 parents commit 19aabc2
Show file tree
Hide file tree
Showing 5 changed files with 383 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.pyc
23 changes: 23 additions & 0 deletions README
@@ -0,0 +1,23 @@
=========
FM-index
=========

This repository contains educational implementations
of Burrows-Wheeler Tranformation and Ferragina-Manzini
index.

The implementations are fully commented and the main
goal is to provide clear implementations of BWT and FM-index.

Speed is considered secondary although there are
different implementations with different algorithmic
complexity.

Related Papers
--------------

* Burrows M and Wheeler D (1994), "A block sorting lossless data compression algorithm"
* Ferragina P and Manzini G (2000), "Opportunistic data structures with applications"
* Ferragina P and Manzini G (2001), "An experimental study of an opportunistic index"
* Ferragina P and Manzini G (2005), "Indexing compressed text"
* Ferragina P and Manzini G (2001), "An experimental study of a compressed index"
71 changes: 71 additions & 0 deletions bowtie.py
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
#!/usr/bin/python
import os, re
import bwt

class Bowtie:
def __init__(self, data):
self.orig = data
self.offset = {}
self.data = bwt.transform(data)
self.FM = None
self.init_FM()

def init_FM(self):
if self.FM != None: return
# left column
L = sorted(self.data)

# A is a index for letter -> L first occurance of letter
A = {}
last = ""
for i, c in enumerate(L):
if last != c:
A[c] = i
last = c
del last, L

# FM Index
FM = {}
for i, c in enumerate(self.data):
for x, v in A.items():
FM[(i,x)] = v
FM[i] = A[c]
A[c] += 1
i = len(self.data)
for x, v in A.items():
FM[(i,x)] = v
del A

self.FM = FM

def LF(self, idx, qc):
return self.FM[(idx,qc)]

def walk(self, idx):
r = 0
i = idx
while self.data[i] != "\0":
if self.offset.get(i):
r += self.offset[i]
break
r += 1
i = self.FM[i]

if not self.offset.get(idx):
self.offset[i] = r
return r

def search(self, q):
top = 0
bot = len(self.data)
for i, qc in enumerate(q[::-1]):
top = self.LF(top,qc)
bot = self.LF(bot,qc)
if top == bot: return []
matches = []
for i in range(top, bot):
matches.append(self.walk(i))
return sorted(matches)

b = Bowtie("ACGTCCGTAAAGCAGTCG")
245 changes: 245 additions & 0 deletions bwt.py
@@ -0,0 +1,245 @@
# -*- coding: utf-8 -*-
#!/usr/bin/python
import os, re

from suffixtree import SuffixTree

class BurrowsWheeler():
EOS = "\0"
# EOS = "#" # a visible end marker

def Transform(self, s):
""" Simplest Burrows-Wheeler transform implementation, O(n^2) respective
to the length of the text. """
assert self.EOS not in s, "Input string cannot contain null character ('\0')"

# add end of text marker
s += self.EOS

# table of rotated input strings
rotations = [s[i:] + s[:i] for i in range(len(s))]

# sort the table of rotations
table = sorted(rotations)

# extract the last characters of each row
last_column = [row[-1] for row in table]

# convert the characters to a string
r = "".join(last_column)

return r

def Inverse(self, s):
""" Simplest Inverse Burrow-Wheeler transform implementation. """
# make empty table for the suffix array
table = [""] * len(s)

# use LF-mapping to reverse the tranformation
for i in range(len(s)):
# add one letter for each partial string in the suffix array
prepended = [s[i] + table[i] for i in range(len(s))]

# convert it to sorted suffix array
table = sorted(prepended)

# Find the correct row (ending in "\0")
for row in table:
if row.endswith(self.EOS):
s = row
break

# Get rid of trailing null character
s = s.rstrip(self.EOS)

return s

# ---------------------------------------------------------------------------- #
# Different Transform implementations
# ---------------------------------------------------------------------------- #

class SuffixTreeBurrowsWheeler(BurrowsWheeler):

def _walk(self, node, len = 0):
""" returns the length of suffixes ordered alphabetically """
t = []
for c, n in sorted(node.items()):
if c == 0:
t.append(len)
continue
k = self._walk(n, len + 1)
t.extend(k)
return t

def Transform(self, s):
""" Burrows-Wheeler transform with SuffixTree """
assert "\0" not in s, "Input string cannot contain null character ('\0')"

# add end of text marker
s += self.EOS

st = SuffixTree()

# construct a suffix tree O(n * log n)
# can also be done in O(n) time
st.add(s)

# walk inorder to find sorted suffixes
# only get the length of each suffix
lens = self._walk(st.root)

# as the last column letter will be left of the suffix
# this means it's len(suffix) + 1
# from the end of the input string s

r = [0]*len(lens)
for i in xrange(len(lens)):
l = lens[i]
if l == len(lens):
r[i] = self.EOS
else:
r[i] = s[-l-1:-l]
return ''.join(r)

# ---------------------------------------------------------------------------- #
# Different Inverse implementations
# ---------------------------------------------------------------------------- #

class FastBurrowsWheeler(BurrowsWheeler):

def Inverse(self, s):
""" Inverse Burrow-Wheeler transform based on
"A block sorting lossless data compression algorithm"
uses LF-mapping for rebuilding the original text.
O(n) time, O(n*E) memory """

# count the letters in input string for calculating the
# first occurance of the letter in left column of the sorted
# suffix matrix
A = {} # letter count
for i, c in enumerate(s):
if A.get(c):
A[c] += 1
else:
A[c] = 1

# sort the letters
letters = sorted(A.keys())

occ = {} # first index of letter

idx = 0
for c in letters:
occ[c] = idx
idx += A[c]
del idx, A

# calculate the LF-mapping
# LF is mapping from input letter rank occurance to left letter
# this shows for which idx in last column corresponds to the first idx
LF = [0] * len(s)
for i, c in enumerate(s):
LF[i] = occ[c]
occ[c] += 1
del occ

# create an empty list for storing the string
r = [0]*(len(s)-1)
i = 0

# here we follow the LF mapping until we have the full string
for k in xrange(len(r)-1,-1,-1):
r[k] = s[i]
i = LF[i]

# convert it to a string
r = ''.join(r)
return r.rstrip(self.EOS)


class CheckpointingBurrowsWheeler(BurrowsWheeler):
# how often checkpoints are made
def __init__(self, step = 20):
self.step = max(1, step)

def LF(self, s, idx, C, occ):
# s - is the transformed text
# idx - is the index in the tranformed string
# C - is the checkpoint list with step 20
# occ - is the first occurance of the letters

# find the nearest checkpoint for idx
check = (idx + (self.step / 2)) / self.step
if check >= len(C):
check = len(C) - 1
pos = check * self.step

letter = s[idx]

# count of the letter s[idx] upto pos (not included)
count = C[check].get(letter)
if count == None: count = 0

# range between pos and idx
if pos < idx:
r = xrange(pos, idx)
else:
r = xrange(idx, pos)

# count of letters between pos, idx
k = 0
for i in r:
if letter == s[i]:
k += 1

# calculate the letter count upto idx (not included)
if pos < idx:
count += k
else:
count -= k

# return the appropriate idx
return occ[letter] + count

def Inverse(self, s):
""" O(n * (step / 4) + n) time, O(n / step + step * E) memory,
where E is the letter count """

# count the letters in input string for calculating the
# first occurance of the letter in left column of the sorted
# suffix matrix
A = {} # letter count
C = [] # checkpoints
for i, c in enumerate(s):
if i % self.step == 0:
C.append(A.copy())
if A.get(c):
A[c] += 1
else:
A[c] = 1

# sort the letters
letters = sorted(A.keys())

# first index of letter in left column
occ = {}

idx = 0
for c in letters:
occ[c] = idx
idx += A[c]

del A

# create an empty list for storing the string
r = [0]*(len(s)-1)
i = 0

# here we follow the LF mapping until we have the full string
for k in xrange(len(r)-1,-1,-1):
r[k] = s[i]
i = self.LF(s, i, C, occ)

# convert it to a string
r = ''.join(r)
return r.rstrip(self.EOS)
43 changes: 43 additions & 0 deletions suffixtree.py
@@ -0,0 +1,43 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import pprint as pp
import re

class SuffixTree(object):
def __init__(self,*args):
self.root = {}

def _add(self, node, s):
if len(s) <= 0:
node[0] = ''
return
c = s[0]
if node.has_key(c):
self._add(node[c], s[1:])
else:
node[c] = {}
self._add(node[c], s[1:])

def add(self, s):
for i in xrange(len(s)):
self._add(self.root, s[i:])

def __repr__(self):
return str(self.root)

def _strings(self, node, prefix):
t = []
for c,n in sorted(node.items()):
if c == 0:
t.append(prefix)
continue
k = self._strings(n, prefix + c)
t.extend(k)
return t

def strings(self):
return self._strings(self.root,'')

def __str__(self):
return '\n'.join(self.strings())

0 comments on commit 19aabc2

Please sign in to comment.