-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 19aabc2
Showing
5 changed files
with
383 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
========= | ||
FM-index | ||
========= | ||
|
||
This repository contains educational implementations | ||
of Burrows-Wheeler Tranformation and Ferragina-Manzini | ||
index. | ||
|
||
The implementations are fully commented and the main | ||
goal is to provide clear implementations of BWT and FM-index. | ||
|
||
Speed is considered secondary although there are | ||
different implementations with different algorithmic | ||
complexity. | ||
|
||
Related Papers | ||
-------------- | ||
|
||
* Burrows M and Wheeler D (1994), "A block sorting lossless data compression algorithm" | ||
* Ferragina P and Manzini G (2000), "Opportunistic data structures with applications" | ||
* Ferragina P and Manzini G (2001), "An experimental study of an opportunistic index" | ||
* Ferragina P and Manzini G (2005), "Indexing compressed text" | ||
* Ferragina P and Manzini G (2001), "An experimental study of a compressed index" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# -*- coding: utf-8 -*- | ||
#!/usr/bin/python | ||
import os, re | ||
import bwt | ||
|
||
class Bowtie: | ||
def __init__(self, data): | ||
self.orig = data | ||
self.offset = {} | ||
self.data = bwt.transform(data) | ||
self.FM = None | ||
self.init_FM() | ||
|
||
def init_FM(self): | ||
if self.FM != None: return | ||
# left column | ||
L = sorted(self.data) | ||
|
||
# A is a index for letter -> L first occurance of letter | ||
A = {} | ||
last = "" | ||
for i, c in enumerate(L): | ||
if last != c: | ||
A[c] = i | ||
last = c | ||
del last, L | ||
|
||
# FM Index | ||
FM = {} | ||
for i, c in enumerate(self.data): | ||
for x, v in A.items(): | ||
FM[(i,x)] = v | ||
FM[i] = A[c] | ||
A[c] += 1 | ||
i = len(self.data) | ||
for x, v in A.items(): | ||
FM[(i,x)] = v | ||
del A | ||
|
||
self.FM = FM | ||
|
||
def LF(self, idx, qc): | ||
return self.FM[(idx,qc)] | ||
|
||
def walk(self, idx): | ||
r = 0 | ||
i = idx | ||
while self.data[i] != "\0": | ||
if self.offset.get(i): | ||
r += self.offset[i] | ||
break | ||
r += 1 | ||
i = self.FM[i] | ||
|
||
if not self.offset.get(idx): | ||
self.offset[i] = r | ||
return r | ||
|
||
def search(self, q): | ||
top = 0 | ||
bot = len(self.data) | ||
for i, qc in enumerate(q[::-1]): | ||
top = self.LF(top,qc) | ||
bot = self.LF(bot,qc) | ||
if top == bot: return [] | ||
matches = [] | ||
for i in range(top, bot): | ||
matches.append(self.walk(i)) | ||
return sorted(matches) | ||
|
||
b = Bowtie("ACGTCCGTAAAGCAGTCG") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
# -*- coding: utf-8 -*- | ||
#!/usr/bin/python | ||
import os, re | ||
|
||
from suffixtree import SuffixTree | ||
|
||
class BurrowsWheeler(): | ||
EOS = "\0" | ||
# EOS = "#" # a visible end marker | ||
|
||
def Transform(self, s): | ||
""" Simplest Burrows-Wheeler transform implementation, O(n^2) respective | ||
to the length of the text. """ | ||
assert self.EOS not in s, "Input string cannot contain null character ('\0')" | ||
|
||
# add end of text marker | ||
s += self.EOS | ||
|
||
# table of rotated input strings | ||
rotations = [s[i:] + s[:i] for i in range(len(s))] | ||
|
||
# sort the table of rotations | ||
table = sorted(rotations) | ||
|
||
# extract the last characters of each row | ||
last_column = [row[-1] for row in table] | ||
|
||
# convert the characters to a string | ||
r = "".join(last_column) | ||
|
||
return r | ||
|
||
def Inverse(self, s): | ||
""" Simplest Inverse Burrow-Wheeler transform implementation. """ | ||
# make empty table for the suffix array | ||
table = [""] * len(s) | ||
|
||
# use LF-mapping to reverse the tranformation | ||
for i in range(len(s)): | ||
# add one letter for each partial string in the suffix array | ||
prepended = [s[i] + table[i] for i in range(len(s))] | ||
|
||
# convert it to sorted suffix array | ||
table = sorted(prepended) | ||
|
||
# Find the correct row (ending in "\0") | ||
for row in table: | ||
if row.endswith(self.EOS): | ||
s = row | ||
break | ||
|
||
# Get rid of trailing null character | ||
s = s.rstrip(self.EOS) | ||
|
||
return s | ||
|
||
# ---------------------------------------------------------------------------- # | ||
# Different Transform implementations | ||
# ---------------------------------------------------------------------------- # | ||
|
||
class SuffixTreeBurrowsWheeler(BurrowsWheeler): | ||
|
||
def _walk(self, node, len = 0): | ||
""" returns the length of suffixes ordered alphabetically """ | ||
t = [] | ||
for c, n in sorted(node.items()): | ||
if c == 0: | ||
t.append(len) | ||
continue | ||
k = self._walk(n, len + 1) | ||
t.extend(k) | ||
return t | ||
|
||
def Transform(self, s): | ||
""" Burrows-Wheeler transform with SuffixTree """ | ||
assert "\0" not in s, "Input string cannot contain null character ('\0')" | ||
|
||
# add end of text marker | ||
s += self.EOS | ||
|
||
st = SuffixTree() | ||
|
||
# construct a suffix tree O(n * log n) | ||
# can also be done in O(n) time | ||
st.add(s) | ||
|
||
# walk inorder to find sorted suffixes | ||
# only get the length of each suffix | ||
lens = self._walk(st.root) | ||
|
||
# as the last column letter will be left of the suffix | ||
# this means it's len(suffix) + 1 | ||
# from the end of the input string s | ||
|
||
r = [0]*len(lens) | ||
for i in xrange(len(lens)): | ||
l = lens[i] | ||
if l == len(lens): | ||
r[i] = self.EOS | ||
else: | ||
r[i] = s[-l-1:-l] | ||
return ''.join(r) | ||
|
||
# ---------------------------------------------------------------------------- # | ||
# Different Inverse implementations | ||
# ---------------------------------------------------------------------------- # | ||
|
||
class FastBurrowsWheeler(BurrowsWheeler): | ||
|
||
def Inverse(self, s): | ||
""" Inverse Burrow-Wheeler transform based on | ||
"A block sorting lossless data compression algorithm" | ||
uses LF-mapping for rebuilding the original text. | ||
O(n) time, O(n*E) memory """ | ||
|
||
# count the letters in input string for calculating the | ||
# first occurance of the letter in left column of the sorted | ||
# suffix matrix | ||
A = {} # letter count | ||
for i, c in enumerate(s): | ||
if A.get(c): | ||
A[c] += 1 | ||
else: | ||
A[c] = 1 | ||
|
||
# sort the letters | ||
letters = sorted(A.keys()) | ||
|
||
occ = {} # first index of letter | ||
|
||
idx = 0 | ||
for c in letters: | ||
occ[c] = idx | ||
idx += A[c] | ||
del idx, A | ||
|
||
# calculate the LF-mapping | ||
# LF is mapping from input letter rank occurance to left letter | ||
# this shows for which idx in last column corresponds to the first idx | ||
LF = [0] * len(s) | ||
for i, c in enumerate(s): | ||
LF[i] = occ[c] | ||
occ[c] += 1 | ||
del occ | ||
|
||
# create an empty list for storing the string | ||
r = [0]*(len(s)-1) | ||
i = 0 | ||
|
||
# here we follow the LF mapping until we have the full string | ||
for k in xrange(len(r)-1,-1,-1): | ||
r[k] = s[i] | ||
i = LF[i] | ||
|
||
# convert it to a string | ||
r = ''.join(r) | ||
return r.rstrip(self.EOS) | ||
|
||
|
||
class CheckpointingBurrowsWheeler(BurrowsWheeler): | ||
# how often checkpoints are made | ||
def __init__(self, step = 20): | ||
self.step = max(1, step) | ||
|
||
def LF(self, s, idx, C, occ): | ||
# s - is the transformed text | ||
# idx - is the index in the tranformed string | ||
# C - is the checkpoint list with step 20 | ||
# occ - is the first occurance of the letters | ||
|
||
# find the nearest checkpoint for idx | ||
check = (idx + (self.step / 2)) / self.step | ||
if check >= len(C): | ||
check = len(C) - 1 | ||
pos = check * self.step | ||
|
||
letter = s[idx] | ||
|
||
# count of the letter s[idx] upto pos (not included) | ||
count = C[check].get(letter) | ||
if count == None: count = 0 | ||
|
||
# range between pos and idx | ||
if pos < idx: | ||
r = xrange(pos, idx) | ||
else: | ||
r = xrange(idx, pos) | ||
|
||
# count of letters between pos, idx | ||
k = 0 | ||
for i in r: | ||
if letter == s[i]: | ||
k += 1 | ||
|
||
# calculate the letter count upto idx (not included) | ||
if pos < idx: | ||
count += k | ||
else: | ||
count -= k | ||
|
||
# return the appropriate idx | ||
return occ[letter] + count | ||
|
||
def Inverse(self, s): | ||
""" O(n * (step / 4) + n) time, O(n / step + step * E) memory, | ||
where E is the letter count """ | ||
|
||
# count the letters in input string for calculating the | ||
# first occurance of the letter in left column of the sorted | ||
# suffix matrix | ||
A = {} # letter count | ||
C = [] # checkpoints | ||
for i, c in enumerate(s): | ||
if i % self.step == 0: | ||
C.append(A.copy()) | ||
if A.get(c): | ||
A[c] += 1 | ||
else: | ||
A[c] = 1 | ||
|
||
# sort the letters | ||
letters = sorted(A.keys()) | ||
|
||
# first index of letter in left column | ||
occ = {} | ||
|
||
idx = 0 | ||
for c in letters: | ||
occ[c] = idx | ||
idx += A[c] | ||
|
||
del A | ||
|
||
# create an empty list for storing the string | ||
r = [0]*(len(s)-1) | ||
i = 0 | ||
|
||
# here we follow the LF mapping until we have the full string | ||
for k in xrange(len(r)-1,-1,-1): | ||
r[k] = s[i] | ||
i = self.LF(s, i, C, occ) | ||
|
||
# convert it to a string | ||
r = ''.join(r) | ||
return r.rstrip(self.EOS) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/usr/bin/python | ||
# -*- coding: utf-8 -*- | ||
|
||
import pprint as pp | ||
import re | ||
|
||
class SuffixTree(object): | ||
def __init__(self,*args): | ||
self.root = {} | ||
|
||
def _add(self, node, s): | ||
if len(s) <= 0: | ||
node[0] = '' | ||
return | ||
c = s[0] | ||
if node.has_key(c): | ||
self._add(node[c], s[1:]) | ||
else: | ||
node[c] = {} | ||
self._add(node[c], s[1:]) | ||
|
||
def add(self, s): | ||
for i in xrange(len(s)): | ||
self._add(self.root, s[i:]) | ||
|
||
def __repr__(self): | ||
return str(self.root) | ||
|
||
def _strings(self, node, prefix): | ||
t = [] | ||
for c,n in sorted(node.items()): | ||
if c == 0: | ||
t.append(prefix) | ||
continue | ||
k = self._strings(n, prefix + c) | ||
t.extend(k) | ||
return t | ||
|
||
def strings(self): | ||
return self._strings(self.root,'') | ||
|
||
def __str__(self): | ||
return '\n'.join(self.strings()) |