Skip to content

Commit

Permalink
Merge pull request #120 from chrislit/modularize
Browse files Browse the repository at this point in the history
Modularize
  • Loading branch information
chrislit committed Oct 19, 2018
2 parents 64abe24 + 58ad882 commit a464fa3
Show file tree
Hide file tree
Showing 238 changed files with 31,659 additions and 26,909 deletions.
3 changes: 0 additions & 3 deletions .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ omit =
*/site-packages/nose/*
*/unittest2/*
*/tests/*
*/__init__.py
parallel = True
branch = True

Expand All @@ -17,5 +16,3 @@ exclude_lines =
pragma: no cover
if __name__ == .__main__.:
if PY3:
omit =
abydos/__init__.py
5 changes: 3 additions & 2 deletions .pyup.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# autogenerated pyup.io config file
# see https://pyup.io/docs/configuration/ for all available options

schedule: every day
update: "insecure"
schedule: "every day"
pin: False
6 changes: 6 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Release History
---------------

0.3.5 (2018-10-20)
++++++++++++++++++

- Refactored library and tests into smaller modules
- Minor bug fixes

0.3.0 (2018-10-15)
++++++++++++++++++

Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,15 @@ Abydos
:target: https://pyup.io/repos/github/chrislit/abydos/
:alt: Updates

.. |pylint| image:: https://img.shields.io/badge/Pylint-9.55/10-green.svg
.. |pylint| image:: https://img.shields.io/badge/Pylint-9.5/10-green.svg
:target: #
:alt: Pylint Score

.. |pycodestyle| image:: https://img.shields.io/badge/pycodestyle-0-brightgreen.svg
:target: #
:alt: pycodestyle Errors

.. |flake8| image:: https://img.shields.io/badge/flake8-40-yellowgreen.svg
.. |flake8| image:: https://img.shields.io/badge/flake8-3-green.svg
:target: #
:alt: flake8 Errors

Expand Down
16 changes: 2 additions & 14 deletions abydos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,5 @@
Abydos NLP/IR library by Christopher C. Little
"""

from .clustering import * # noqa: F403
from .compression import * # noqa: F403
from .corpus import * # noqa: F403
from .distance import * # noqa: F403
from .fingerprint import * # noqa: F403
from .ngram import * # noqa: F403
from .phones import * # noqa: F403
from .phonetic import * # noqa: F403
from .qgram import * # noqa: F403
from .stats import * # noqa: F403
from .stemmer import * # noqa: F403

__all__ = ['clustering', 'compression', 'corpus', 'distance', 'fingerprint',
'ngram', 'phones', 'phonetic', 'qgram', 'stats', 'stemmer']
__all__ = ['compression', 'corpus', 'distance', 'fingerprint', 'phones',
'phonetic', 'stats', 'stemmer', 'tokenizer', 'util']
37 changes: 37 additions & 0 deletions abydos/compression/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.compression.
The compression package defines compression and compression-related functions
for use within Abydos, including implementations of the following:
- arithmetic coding functions (ac_train, ac_encode, & ac_decode)
- Burrows-Wheeler transform encoder/decoder (bwt_encode & bwt_decode)
- Run-Length Encoding encoder/decoder (rle_encode & rle_decode)
"""

from __future__ import unicode_literals


__all__ = ['arithmetic', 'bwt', 'rle']


if __name__ == '__main__':
import doctest
doctest.testmod()
188 changes: 11 additions & 177 deletions abydos/compression.py → abydos/compression/arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,34 +16,26 @@
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.compression.
"""abydos.compression.arithmetic.
The compression module defines compression and compression-related functions
for use within Abydos, including implementations of the following:
- arithmetic coding functions (ac_train, ac_encode, & ac_decode)
- Burrows-Wheeler transform encoder/decoder (bwt_encode & bwt_decode)
- Run-Length Encoding encoder/decoder (rle_encode & rle_decode)
arithmetic coding functions
"""

from __future__ import division, unicode_literals

from collections import Counter
from fractions import Fraction
from itertools import groupby

from six import PY3, text_type
from six.moves import range


if PY3:
long = int

__all__ = ['ac_decode', 'ac_encode', 'ac_train', 'bwt_decode', 'bwt_encode',
'rle_decode', 'rle_encode']
__all__ = ['decode', 'encode', 'train']


def ac_train(text):
def train(text):
r"""Generate a probability dict from the provided text.
Text -> 0-order probability statistics as a dict
Expand All @@ -57,7 +49,7 @@ def ac_train(text):
:returns: a probability dict
:rtype: dict
>>> ac_train('the quick brown fox jumped over the lazy dog')
>>> train('the quick brown fox jumped over the lazy dog')
{' ': (Fraction(0, 1), Fraction(8, 45)),
'o': (Fraction(8, 45), Fraction(4, 15)),
'e': (Fraction(4, 15), Fraction(16, 45)),
Expand Down Expand Up @@ -107,7 +99,7 @@ def ac_train(text):
return prob_range


def ac_encode(text, probs):
def encode(text, probs):
"""Encode a text using arithmetic coding with the provided probabilities.
Text and the 0-order probability statistics -> longval, nbits
Expand All @@ -123,8 +115,8 @@ def ac_encode(text, probs):
:returns: The arithmetically coded text
:rtype: tuple
>>> pr = ac_train('the quick brown fox jumped over the lazy dog')
>>> ac_encode('align', pr)
>>> pr = train('the quick brown fox jumped over the lazy dog')
>>> encode('align', pr)
(16720586181, 34)
"""
text = text_type(text)
Expand Down Expand Up @@ -157,7 +149,7 @@ def ac_encode(text, probs):
return avg.numerator//avg.denominator, nbits


def ac_decode(longval, nbits, probs):
def decode(longval, nbits, probs):
"""Decode the number to a string using the given statistics.
This is based on Andrew Dalke's public domain implementation
Expand All @@ -170,8 +162,8 @@ def ac_decode(longval, nbits, probs):
:returns: The arithmetically decoded text
:rtype: str
>>> pr = ac_train('the quick brown fox jumped over the lazy dog')
>>> ac_decode(16720586181, 34, pr)
>>> pr = train('the quick brown fox jumped over the lazy dog')
>>> decode(16720586181, 34, pr)
'align'
"""
val = Fraction(longval, long(1) << nbits)
Expand All @@ -193,164 +185,6 @@ def ac_decode(longval, nbits, probs):
return ''.join(letters)


def bwt_encode(word, terminator='\0'):
    r"""Return the Burrows-Wheeler transformed form of a word.

    The Burrows-Wheeler transform is an attempt at placing similar characters
    together to improve compression.
    Cf. :cite:`Burrows:1994`.

    :param str word: the word to transform using BWT
    :param str terminator: a character to add to word to signal the end of the
        string
    :returns: word encoded by BWT (just the terminator if word is empty)
    :rtype: str
    :raises ValueError: if the terminator already occurs in word

    >>> bwt_encode('align')
    'n\x00ilag'
    >>> bwt_encode('banana')
    'annb\x00aa'
    >>> bwt_encode('banana', '@')
    'annb@aa'
    """
    if not word:
        return terminator

    if terminator in word:
        # Use a '{}' placeholder: the message is built with str.format, so a
        # '%s' placeholder would be left in the text uninterpolated. A NUL
        # terminator is shown as the two characters '\0' to keep it visible.
        raise ValueError('Specified terminator, {}, already in word.'
                         .format(terminator if
                                 terminator != '\0' else '\\0'))

    word += terminator
    # Sort every rotation of the terminated word; the transform is the last
    # column of that sorted rotation table.
    rotations = sorted(word[i:] + word[:i] for i in range(len(word)))
    return ''.join(rotation[-1] for rotation in rotations)


def bwt_decode(code, terminator='\0'):
    r"""Return a word decoded from BWT form.

    The Burrows-Wheeler transform is an attempt at placing similar characters
    together to improve compression. This function reverses the transform.
    Cf. :cite:`Burrows:1994`.

    :param str code: the word to transform from BWT form
    :param str terminator: a character added to word to signal the end of the
        string
    :returns: word decoded by BWT (the empty string if code is empty)
    :rtype: str
    :raises ValueError: if the terminator does not occur in code

    >>> bwt_decode('n\x00ilag')
    'align'
    >>> bwt_decode('annb\x00aa')
    'banana'
    >>> bwt_decode('annb@aa', '@')
    'banana'
    """
    if not code:
        return ''

    if terminator not in code:
        # Use a '{}' placeholder: the message is built with str.format, so a
        # '%s' placeholder would be left in the text uninterpolated. A NUL
        # terminator is shown as the two characters '\0' to keep it visible.
        raise ValueError('Specified terminator, {}, absent from code.'
                         .format(terminator if
                                 terminator != '\0' else '\\0'))

    size = len(code)
    table = [''] * size
    # Rebuild the sorted rotation table one column at a time: prepend the
    # code to the current rows and re-sort, once per character of the code.
    for _ in range(size):
        table = sorted(code[i] + table[i] for i in range(size))

    # The original word is the rotation that ends with the terminator.
    original = [row for row in table if row[-1] == terminator][0]
    return original.rstrip(terminator)


def rle_encode(text, use_bwt=True):
    r"""Perform encoding of run-length-encoding (RLE).

    Cf. :cite:`Robinson:1967`.

    Based on http://rosettacode.org/wiki/Run-length_encoding#Python
    :cite:`rosettacode:2018`. This is licensed GFDL 1.2.

    Digits 0-9 cannot be in text.

    Runs of three or more identical characters are written as the run length
    followed by the character; runs of one or two characters are written out
    literally.

    :param str text: a text string to encode
    :param bool use_bwt: boolean indicating whether to perform BWT encoding
        before RLE encoding
    :returns: the run-length encoded text (of the BWT-encoded text when
        use_bwt is True)
    :rtype: str

    >>> rle_encode('align')
    'n\x00ilag'
    >>> rle_encode('align', use_bwt=False)
    'align'

    >>> rle_encode('banana')
    'annb\x00aa'
    >>> rle_encode('banana', use_bwt=False)
    'banana'

    >>> rle_encode('aaabaabababa')
    'ab\x00abbab5a'
    >>> rle_encode('aaabaabababa', False)
    '3abaabababa'
    """
    if use_bwt:
        text = bwt_encode(text)

    pieces = []
    for char, run in groupby(text):
        # Count the run without materialising it as a list.
        count = sum(1 for _ in run)
        if count > 2:
            pieces.append(str(count) + char)
        else:
            # A count prefix would not shorten runs of length 1 or 2.
            pieces.append(char * count)
    return ''.join(pieces)


def rle_decode(text, use_bwt=True):
    r"""Perform decoding of run-length-encoding (RLE).

    Cf. :cite:`Robinson:1967`.

    Based on http://rosettacode.org/wiki/Run-length_encoding#Python
    :cite:`rosettacode:2018`. This is licensed GFDL 1.2.

    Digits 0-9 cannot have been in the original text.

    :param str text: a text string to decode
    :param bool use_bwt: boolean indicating whether to perform BWT decoding
        after RLE decoding
    :returns: the run-length decoded text (additionally BWT-decoded when
        use_bwt is True)
    :rtype: str

    >>> rle_decode('n\x00ilag')
    'align'
    >>> rle_decode('align', use_bwt=False)
    'align'

    >>> rle_decode('annb\x00aa')
    'banana'
    >>> rle_decode('banana', use_bwt=False)
    'banana'

    >>> rle_decode('ab\x00abbab5a')
    'aaabaabababa'
    >>> rle_decode('3abaabababa', False)
    'aaabaabababa'
    """
    mult = ''
    decoded = []
    for letter in text:
        if letter.isdigit():
            # Accumulate a (possibly multi-digit) repeat count.
            mult += letter
        elif mult:
            # A pending count applies to the first non-digit that follows it.
            decoded.append(int(mult) * letter)
            mult = ''
        else:
            decoded.append(letter)

    text = ''.join(decoded)
    if use_bwt:
        text = bwt_decode(text)
    return text


if __name__ == '__main__':
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)

0 comments on commit a464fa3

Please sign in to comment.