Merge pull request #120 from chrislit/modularize

Modularize
chrislit · Oct 19, 2018 · a464fa3
2 parents 64abe24 + 58ad882
commit a464fa3
Show file tree

Hide file tree

Showing 238 changed files with 31,659 additions and 26,909 deletions.
diff --git a/.coveragerc b/.coveragerc
@@ -8,7 +8,6 @@ omit =
     */site-packages/nose/*
     */unittest2/*
     */tests/*
-    */__init__.py
 parallel = True
 branch = True
 
@@ -17,5 +16,3 @@ exclude_lines =
     pragma: no cover
     if __name__ == .__main__.:
     if PY3:
-omit =
-    abydos/__init__.py
diff --git a/.pyup.yml b/.pyup.yml
@@ -1,4 +1,5 @@
 # autogenerated pyup.io config file 
 # see https://pyup.io/docs/configuration/ for all available options
-
-schedule: every day
+update: "insecure"
+schedule: "every day"
+pin: False
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -1,6 +1,12 @@
 Release History
 ---------------
 
+0.3.5 (2018-10-20)
+++++++++++++++++++
+
+- Refactored library and tests into smaller modules
+- Minor bug fixes
+
 0.3.0 (2018-10-15)
 ++++++++++++++++++
 

diff --git a/README.rst b/README.rst
@@ -72,15 +72,15 @@ Abydos
      :target: https://pyup.io/repos/github/chrislit/abydos/
      :alt: Updates
 
-.. |pylint| image:: https://img.shields.io/badge/Pylint-9.55/10-green.svg
+.. |pylint| image:: https://img.shields.io/badge/Pylint-9.5/10-green.svg
    :target: #
    :alt: Pylint Score
 
 .. |pycodestyle| image:: https://img.shields.io/badge/pycodestyle-0-brightgreen.svg
    :target: #
    :alt: pycodestyle Errors
 
-.. |flake8| image:: https://img.shields.io/badge/flake8-40-yellowgreen.svg
+.. |flake8| image:: https://img.shields.io/badge/flake8-3-green.svg
    :target: #
    :alt: flake8 Errors
 

diff --git a/abydos/__init__.py b/abydos/__init__.py
@@ -21,17 +21,5 @@
 Abydos NLP/IR library by Christopher C. Little
 """
 
-from .clustering import *   # noqa: F403
-from .compression import *  # noqa: F403
-from .corpus import *       # noqa: F403
-from .distance import *     # noqa: F403
-from .fingerprint import *  # noqa: F403
-from .ngram import *        # noqa: F403
-from .phones import *       # noqa: F403
-from .phonetic import *     # noqa: F403
-from .qgram import *        # noqa: F403
-from .stats import *        # noqa: F403
-from .stemmer import *      # noqa: F403
-
-__all__ = ['clustering', 'compression', 'corpus', 'distance', 'fingerprint',
-           'ngram', 'phones', 'phonetic', 'qgram', 'stats', 'stemmer']
+__all__ = ['compression', 'corpus', 'distance', 'fingerprint', 'phones',
+           'phonetic', 'stats', 'stemmer', 'tokenizer', 'util']
diff --git a/abydos/compression/__init__.py b/abydos/compression/__init__.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 by Christopher C. Little.
+# This file is part of Abydos.
+#
+# Abydos is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Abydos is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
+
+"""abydos.compression.
+
+The compression package defines compression and compression-related functions
+for use within Abydos, including implementations of the following:
+
+    - arithmetic coding functions (ac_train, ac_encode, & ac_decode)
+    - Burrows-Wheeler transform encoder/decoder (bwt_encode & bwt_decode)
+    - Run-Length Encoding encoder/decoder (rle_encode & rle_decode)
+"""
+
+from __future__ import unicode_literals
+
+
+__all__ = ['arithmetic', 'bwt', 'rle']
+
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
diff --git a/abydos/compression.py b/abydos/compression/arithmetic.py
@@ -16,34 +16,26 @@
 # You should have received a copy of the GNU General Public License
 # along with Abydos. If not, see <http://www.gnu.org/licenses/>.
 
-"""abydos.compression.
+"""abydos.compression.arithmetic.
 
-The compression module defines compression and compression-related functions
-for use within Abydos, including implementations of the following:
-
-    - arithmetic coding functions (ac_train, ac_encode, & ac_decode)
-    - Burrows-Wheeler transform encoder/decoder (bwt_encode & bwt_decode)
-    - Run-Length Encoding encoder/decoder (rle_encode & rle_decode)
+arithmetic coding functions
 """
 
 from __future__ import division, unicode_literals
 
 from collections import Counter
 from fractions import Fraction
-from itertools import groupby
 
 from six import PY3, text_type
-from six.moves import range
 
 
 if PY3:
     long = int
 
-__all__ = ['ac_decode', 'ac_encode', 'ac_train', 'bwt_decode', 'bwt_encode',
-           'rle_decode', 'rle_encode']
+__all__ = ['decode', 'encode', 'train']
 
 
-def ac_train(text):
+def train(text):
     r"""Generate a probability dict from the provided text.
 
     Text -> 0-order probability statistics as a dict
@@ -57,7 +49,7 @@ def ac_train(text):
     :returns: a probability dict
     :rtype: dict
 
-    >>> ac_train('the quick brown fox jumped over the lazy dog')
+    >>> train('the quick brown fox jumped over the lazy dog')
     {' ': (Fraction(0, 1), Fraction(8, 45)),
      'o': (Fraction(8, 45), Fraction(4, 15)),
      'e': (Fraction(4, 15), Fraction(16, 45)),
@@ -107,7 +99,7 @@ def ac_train(text):
     return prob_range
 
 
-def ac_encode(text, probs):
+def encode(text, probs):
     """Encode a text using arithmetic coding with the provided probabilities.
 
     Text and the 0-order probability statistics -> longval, nbits
@@ -123,8 +115,8 @@ def ac_encode(text, probs):
     :returns: The arithmetically coded text
     :rtype: tuple
 
-    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
-    >>> ac_encode('align', pr)
+    >>> pr = train('the quick brown fox jumped over the lazy dog')
+    >>> encode('align', pr)
     (16720586181, 34)
     """
     text = text_type(text)
@@ -157,7 +149,7 @@ def ac_encode(text, probs):
     return avg.numerator//avg.denominator, nbits
 
 
-def ac_decode(longval, nbits, probs):
+def decode(longval, nbits, probs):
     """Decode the number to a string using the given statistics.
 
     This is based on Andrew Dalke's public domain implementation
@@ -170,8 +162,8 @@ def ac_decode(longval, nbits, probs):
     :returns: The arithmetically decoded text
     :rtype: str
 
-    >>> pr = ac_train('the quick brown fox jumped over the lazy dog')
-    >>> ac_decode(16720586181, 34, pr)
+    >>> pr = train('the quick brown fox jumped over the lazy dog')
+    >>> decode(16720586181, 34, pr)
     'align'
     """
     val = Fraction(longval, long(1) << nbits)
@@ -193,164 +185,6 @@ def ac_decode(longval, nbits, probs):
     return ''.join(letters)
 
 
-def bwt_encode(word, terminator='\0'):
-    r"""Return the Burrows-Wheeler transformed form of a word.
-
-    The Burrows-Wheeler transform is an attempt at placing similar characters
-    together to improve compression.
-    Cf. :cite:`Burrows:1994`.
-
-    :param str word: the word to transform using BWT
-    :param str terminator: a character to add to word to signal the end of the
-        string
-    :returns: word encoded by BWT
-    :rtype: str
-
-    >>> bwt_encode('align')
-    'n\x00ilag'
-    >>> bwt_encode('banana')
-    'annb\x00aa'
-    >>> bwt_encode('banana', '@')
-    'annb@aa'
-    """
-    if word:
-        if terminator in word:
-            raise ValueError('Specified terminator, %s, already in word.'
-                             .format(terminator if
-                                     terminator != '\0' else '\\0'))
-        else:
-            word += terminator
-            wordlist = sorted(word[i:] + word[:i] for i in range(len(word)))
-            return ''.join([w[-1] for w in wordlist])
-    else:
-        return terminator
-
-
-def bwt_decode(code, terminator='\0'):
-    r"""Return a word decoded from BWT form.
-
-    The Burrows-Wheeler transform is an attempt at placing similar characters
-    together to improve compression. This function reverses the transform.
-    Cf. :cite:`Burrows:1994`.
-
-    :param str code: the word to transform from BWT form
-    :param str terminator: a character added to word to signal the end of the
-        string
-    :returns: word decoded by BWT
-    :rtype: str
-
-    >>> bwt_decode('n\x00ilag')
-    'align'
-    >>> bwt_decode('annb\x00aa')
-    'banana'
-    >>> bwt_decode('annb@aa', '@')
-    'banana'
-    """
-    if code:
-        if terminator not in code:
-            raise ValueError('Specified terminator, %s, absent from code.'
-                             .format(terminator if
-                                     terminator != '\0' else '\\0'))
-        else:
-            wordlist = [''] * len(code)
-            for i in range(len(code)):
-                wordlist = sorted(code[i] + wordlist[i] for i in
-                                  range(len(code)))
-            rows = [w for w in wordlist if w[-1] == terminator][0]
-            return rows.rstrip(terminator)
-    else:
-        return ''
-
-
-def rle_encode(text, use_bwt=True):
-    r"""Perform encoding of run-length-encoding (RLE).
-
-    Cf. :cite:`Robinson:1967`.
-
-    Based on http://rosettacode.org/wiki/Run-length_encoding#Python
-    :cite:`rosettacode:2018`. This is licensed GFDL 1.2.
-
-    Digits 0-9 cannot be in text.
-
-    :param str text: a text string to encode
-    :param bool use_bwt: boolean indicating whether to perform BWT encoding
-        before RLE encoding
-    :returns: word decoded by BWT
-    :rtype: str
-
-    >>> rle_encode('align')
-    'n\x00ilag'
-    >>> rle_encode('align', use_bwt=False)
-    'align'
-
-    >>> rle_encode('banana')
-    'annb\x00aa'
-    >>> rle_encode('banana', use_bwt=False)
-    'banana'
-
-    >>> rle_encode('aaabaabababa')
-    'ab\x00abbab5a'
-    >>> rle_encode('aaabaabababa', False)
-    '3abaabababa'
-    """
-    if use_bwt:
-        text = bwt_encode(text)
-    if text:
-        text = ((len(list(g)), k) for k, g in groupby(text))
-        text = ((str(n) + k if n > 2 else (k if n == 1 else 2*k)) for
-                n, k in text)
-    return ''.join(text)
-
-
-def rle_decode(text, use_bwt=True):
-    r"""Perform decoding of run-length-encoding (RLE).
-
-    Cf. :cite:`Robinson:1967`.
-
-    Based on http://rosettacode.org/wiki/Run-length_encoding#Python
-    :cite:`rosettacode:2018`. This is licensed GFDL 1.2.
-
-    Digits 0-9 cannot have been in the original text.
-
-    :param str text: a text string to decode
-    :param bool use_bwt: boolean indicating whether to perform BWT decoding
-        after RLE decoding
-    :returns: word decoded by BWT
-    :rtype: str
-
-    >>> rle_decode('n\x00ilag')
-    'align'
-    >>> rle_decode('align', use_bwt=False)
-    'align'
-
-    >>> rle_decode('annb\x00aa')
-    'banana'
-    >>> rle_decode('banana', use_bwt=False)
-    'banana'
-
-    >>> rle_decode('ab\x00abbab5a')
-    'aaabaabababa'
-    >>> rle_decode('3abaabababa', False)
-    'aaabaabababa'
-    """
-    mult = ''
-    decoded = []
-    for letter in list(text):
-        if not letter.isdigit():
-            if mult:
-                decoded.append(int(mult)*letter)
-                mult = ''
-            else:
-                decoded.append(letter)
-        else:
-            mult += letter
-
-    text = ''.join(decoded)
-    if use_bwt:
-        text = bwt_decode(text)
-    return text
-
-
 if __name__ == '__main__':
     import doctest
     doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)